From 926259c411c1022812ffb7fe88ca61f0180bd778 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Thu, 14 Dec 2017 09:51:09 +0800 Subject: [PATCH 0001/1734] TST: test case for string --- tensorflow/python/kernel_tests/scatter_nd_ops_test.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py index 9f579495152..83d69c651ae 100644 --- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py +++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py @@ -364,6 +364,16 @@ class ScatterNdTest(test.TestCase): del input_ # input_ is not used in scatter_nd return array_ops.scatter_nd(indices, updates, shape) + def testString(self): + indices = constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32) + updates = constant_op.constant(["four", "three", "one", "seven"], dtype=dtypes.string) + expected = np.array(["", "one", "", "three", "four", "", "", "seven"]) + scatter = self.scatter_nd(indices, updates, shape=(8,)) + + with self.test_session() as sess: + result = sess.run(scatter) + self.assertTrue(np.array_equal(result, expected)) + def testRank3ValidShape(self): indices = array_ops.zeros([2, 2, 2], dtypes.int32) updates = array_ops.zeros([2, 2, 2], dtypes.int32) From 005840c6e2d2a4c25ecd293162a38a79dedf1a4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Thu, 14 Dec 2017 10:06:44 +0800 Subject: [PATCH 0002/1734] ENH: supports string for cpu --- tensorflow/core/kernels/scatter_nd_op.cc | 1 + tensorflow/core/kernels/scatter_nd_op_cpu_impl.h | 1 + 2 files changed, 2 insertions(+) diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc index 3a95dd17733..0caa7bd3179 100644 --- a/tensorflow/core/kernels/scatter_nd_op.cc +++ b/tensorflow/core/kernels/scatter_nd_op.cc @@ -241,6 +241,7 @@ class ScatterNdUpdateOp : public OpKernel { TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_ADD_SUB_CPU); TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_UPDATE_CPU); TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_CPU); +TF_CALL_string(REGISTER_SCATTER_ND_CPU); // Registers GPU kernels. 
#if GOOGLE_CUDA diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h index cffc326174b..155d354d857 100644 --- a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h +++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h @@ -160,6 +160,7 @@ struct ScatterNdFunctor { REGISTER_SCATTER_ND_INDEX(type, scatter_nd_op::UpdateOp::SUB); TF_CALL_ALL_TYPES(REGISTER_SCATTER_ND_UPDATE); +REGISTER_SCATTER_ND_INDEX(string, scatter_nd_op::UpdateOp::ADD); TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_MATH) #undef REGISTER_SCATTER_ND_MATH From d887d2bcfc819034b17e812a9a60460e2d61e447 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Thu, 14 Dec 2017 12:14:40 +0800 Subject: [PATCH 0003/1734] TST: ignore NonAliasingAdd --- tensorflow/python/kernel_tests/scatter_nd_ops_test.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py index 83d69c651ae..03b2f892c62 100644 --- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py +++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py @@ -594,6 +594,10 @@ class ScatterNdNonAliasingAddTest(ScatterNdTest): shape, dtype=updates.dtype)) return array_ops.scatter_nd_non_aliasing_add(input_, indices, updates) + def testString(self): + # Not supported yet. + pass + if __name__ == "__main__": test.main() From 4b697e0d9472215c706bdb36bb72986cdce78edd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Thu, 14 Dec 2017 13:51:34 +0800 Subject: [PATCH 0004/1734] DOC: modify document --- tensorflow/core/ops/array_ops.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc index 5a31f433cee..933ebe6b631 100644 --- a/tensorflow/core/ops/array_ops.cc +++ b/tensorflow/core/ops/array_ops.cc @@ -5332,12 +5332,13 @@ REGISTER_OP("ScatterNd") .Attr("Tindices: {int32, int64}") .SetShapeFn(ScatterNdShape) .Doc(R"doc( -Scatter `updates` into a new (initially zero) tensor according to `indices`. +Scatter `updates` into a new (initially zero for numeric, empty for string) +tensor according to `indices`. -Creates a new tensor by applying sparse `updates` to individual -values or slices within a zero tensor of the given `shape` according to -indices. This operator is the inverse of the @{tf.gather_nd} operator which -extracts values or slices from a given tensor. +Creates a new tensor by applying sparse `updates` to individual values or +slices within a zero (or empty string) tensor of the given `shape` +according to indices. This operator is the inverse of the @{tf.gather_nd} +operator which extracts values or slices from a given tensor. **WARNING**: The order in which updates are applied is nondeterministic, so the output will be nondeterministic if `indices` contains duplicates. 
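Taken together, the patches above add string support to `ScatterNd` on CPU: a Python test, the kernel registrations, and the doc updates. Below is a minimal sketch of what the new registration enables, mirroring the `testString` case above; it assumes a TF 1.x build that includes these patches (session-style usage is illustrative, not part of the patches themselves):

```python
import tensorflow as tf

# Scatter string updates into a new, initially empty ("") string tensor.
indices = tf.constant([[4], [3], [1], [7]])
updates = tf.constant(["four", "three", "one", "seven"])
scatter = tf.scatter_nd(indices, updates, shape=[8])

with tf.Session() as sess:
    # Expected: [b'' b'one' b'' b'three' b'four' b'' b'' b'seven']
    print(sess.run(scatter))
```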
From 597403e03680d69b72dbfa669f7bbdc77ce21ec9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Wed, 20 Dec 2017 16:34:48 +0800 Subject: [PATCH 0005/1734] CLN: conform docstring --- tensorflow/core/ops/array_ops.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc index 933ebe6b631..89b6eb7162c 100644 --- a/tensorflow/core/ops/array_ops.cc +++ b/tensorflow/core/ops/array_ops.cc @@ -5332,13 +5332,12 @@ REGISTER_OP("ScatterNd") .Attr("Tindices: {int32, int64}") .SetShapeFn(ScatterNdShape) .Doc(R"doc( -Scatter `updates` into a new (initially zero for numeric, empty for string) -tensor according to `indices`. +Scatter `updates` into a new empty tensor according to `indices`. Creates a new tensor by applying sparse `updates` to individual values or -slices within a zero (or empty string) tensor of the given `shape` -according to indices. This operator is the inverse of the @{tf.gather_nd} -operator which extracts values or slices from a given tensor. +slices within a tensor (initially zero for numeric, empty for string) of +the given `shape` according to indices. This operator is the inverse of the +@{tf.gather_nd} operator which extracts values or slices from a given tensor. **WARNING**: The order in which updates are applied is nondeterministic, so the output will be nondeterministic if `indices` contains duplicates. From 736e8c4ccb16718d11cf7c8e1fac843bf6e388a7 Mon Sep 17 00:00:00 2001 From: ManHyuk Date: Wed, 14 Feb 2018 18:26:20 +0900 Subject: [PATCH 0006/1734] fix typo --- tensorflow/core/lib/io/record_writer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/lib/io/record_writer.cc b/tensorflow/core/lib/io/record_writer.cc index 3657243c5d3..ebc56482699 100644 --- a/tensorflow/core/lib/io/record_writer.cc +++ b/tensorflow/core/lib/io/record_writer.cc @@ -49,7 +49,7 @@ RecordWriterOptions RecordWriterOptions::CreateRecordWriterOptions( #endif // IS_SLIM_BUILD } else if (compression_type != compression::kNone) { LOG(ERROR) << "Unsupported compression_type:" << compression_type - << ". No comprression will be used."; + << ". No compression will be used."; } return options; } From 617fa4e5fa634270c36a2a8762e6ce96bd38f2f8 Mon Sep 17 00:00:00 2001 From: ManHyuk Date: Wed, 14 Feb 2018 18:35:31 +0900 Subject: [PATCH 0007/1734] fix typo --- tensorflow/contrib/makefile/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/makefile/README.md b/tensorflow/contrib/makefile/README.md index b0228c54350..995230dfa84 100644 --- a/tensorflow/contrib/makefile/README.md +++ b/tensorflow/contrib/makefile/README.md @@ -155,7 +155,7 @@ CC_PREFIX=ccache tensorflow/contrib/makefile/build_all_android.sh -s tensorflow/ (add -T on subsequent builds to skip protobuf downloading/building) -#### Testing the the CUDA-enabled benchmark via adb: +#### Testing the CUDA-enabled benchmark via adb: Build binaries first as above, then run: ```bash From b81aaac898d93e17b4a280bb02547d2a60d490cb Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Thu, 15 Feb 2018 08:28:12 +0000 Subject: [PATCH 0008/1734] Fix warnings in tf.contrib.bayesflow.monte_carlo.expectation This fix fixes several warnings in tf.contrib.bayesflow.monte_carlo.expectation by switching to keepdims for tf.reduce_mean. 
Signed-off-by: Yong Tang
---
 tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
index 985177e897f..5263e87ae68 100644
--- a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
@@ -328,7 +328,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
   if not callable(f):
     raise ValueError('`f` must be a callable function.')
   if use_reparametrization:
-    return math_ops.reduce_mean(f(samples), axis=axis, keep_dims=keep_dims)
+    return math_ops.reduce_mean(f(samples), axis=axis, keepdims=keep_dims)
   else:
     if not callable(log_prob):
       raise ValueError('`log_prob` must be a callable function.')
@@ -348,7 +348,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
     #   "Is there a floating point value of x, for which x-x == 0 is false?"
     #   http://stackoverflow.com/q/2686644
     fx += stop(fx) * (logpx - stop(logpx))  # Add zeros_like(logpx).
-    return math_ops.reduce_mean(fx, axis=axis, keep_dims=keep_dims)
+    return math_ops.reduce_mean(fx, axis=axis, keepdims=keep_dims)

 def _sample_mean(values):

From 9c272adf248228408448db6219b238145f5a02ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
Date: Fri, 16 Feb 2018 10:38:50 +0800
Subject: [PATCH 0009/1734] DOC: move doc to api def file

---
 .../core/api_def/base_api/api_def_ScatterNd.pbtxt | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
index 4cb8c064fce..4e95895f548 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
@@ -25,12 +25,12 @@ A new tensor with the given shape and updates applied according to the
 indices.
 END
   }
-  summary: "Scatter `updates` into a new (initially zero) tensor according to `indices`."
+  summary: "Scatter `updates` into a new empty tensor according to `indices`."
   description: <<END
-Creates a new tensor by applying sparse `updates` to individual
-values or slices within a zero tensor of the given `shape` according to
-indices. This operator is the inverse of the @{tf.gather_nd} operator which
-extracts values or slices from a given tensor.
+Creates a new tensor by applying sparse `updates` to individual values or
+slices within a tensor (initially zero for numeric, empty for string) of
+the given `shape` according to indices. This operator is the inverse of the
+@{tf.gather_nd} operator which extracts values or slices from a given tensor.

From: ManHyuk
Date: Sun, 25 Feb 2018 21:39:52 +0900
Subject: [PATCH 0010/1734] fix typo

---
 .../python/kernel_tests/linalg/linear_operator_diag_test.py | 2 +-
 tensorflow/python/ops/linalg/linear_operator_diag.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
index 343d1584988..8cb9f9e6213 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
@@ -129,7 +129,7 @@ class LinearOperatorDiagTest(
     with self.test_session() as sess:
       x = random_ops.random_normal(shape=(2, 2, 3, 4))

-      # This LinearOperatorDiag will be brodacast to (2, 2, 3, 3) during solve
+      # This LinearOperatorDiag will be broadcast to (2, 2, 3, 3) during solve
       # and matmul with 'x' as the argument.
      diag = random_ops.random_uniform(shape=(2, 1, 3))
      operator = linalg.LinearOperatorDiag(diag, is_self_adjoint=True)
diff --git a/tensorflow/python/ops/linalg/linear_operator_diag.py b/tensorflow/python/ops/linalg/linear_operator_diag.py
index b3ec3d5b7cf..e180e830263 100644
--- a/tensorflow/python/ops/linalg/linear_operator_diag.py
+++ b/tensorflow/python/ops/linalg/linear_operator_diag.py
@@ -67,7 +67,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
   operator = LinearOperatorDiag(diag)

   # Create a shape [2, 1, 4, 2] vector. Note that this shape is compatible
-  # since the batch dimensions, [2, 1], are brodcast to
+  # since the batch dimensions, [2, 1], are broadcast to
   # operator.batch_shape = [2, 3].
   y = tf.random_normal(shape=[2, 1, 4, 2])
   x = operator.solve(y)

From b569035378ef4a8595c64e5f398d74244cac376e Mon Sep 17 00:00:00 2001
From: ManHyuk
Date: Sun, 25 Feb 2018 21:44:12 +0900
Subject: [PATCH 0011/1734] fix typo

---
 tensorflow/contrib/slim/python/slim/data/parallel_reader.py | 2 +-
 tensorflow/python/ops/distributions/special_math.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/slim/python/slim/data/parallel_reader.py b/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
index ad5e9854871..b3343aef47d 100644
--- a/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
+++ b/tensorflow/contrib/slim/python/slim/data/parallel_reader.py
@@ -221,7 +221,7 @@ def parallel_read(data_sources,
       the data will be cycled through indefinitely.
     num_readers: a integer, number of Readers to create.
     reader_kwargs: an optional dict, of kwargs for the reader.
-    shuffle: boolean, wether should shuffle the files and the records by using
+    shuffle: boolean, whether to shuffle the files and the records by using
       RandomShuffleQueue as common_queue.
     dtypes: A list of types. The length of dtypes must equal the number of
       elements in each record. If it is None it will default to
diff --git a/tensorflow/python/ops/distributions/special_math.py b/tensorflow/python/ops/distributions/special_math.py
index bed4cbb2c1a..1d605c5dfcc 100644
--- a/tensorflow/python/ops/distributions/special_math.py
+++ b/tensorflow/python/ops/distributions/special_math.py
@@ -213,7 +213,7 @@ def _ndtri(p):
   # Compute x for p <= exp(-2): x = z - log(z)/z - (1/z) P(1/z) / Q(1/z),
   # where z = sqrt(-2. * log(p)), and P/Q are chosen between two different
-  # arrays based on wether p < exp(-32).
+  # arrays based on whether p < exp(-32).
   z = math_ops.sqrt(-2. * math_ops.log(sanitized_mcp))
   first_term = z - math_ops.log(z) / z
   second_term_small_p = (_create_polynomial(1.
/ z, p2)

From ef4e8ad826c8946f8ff3e0f7e1b3bb3bec61010c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
Date: Wed, 21 Feb 2018 15:06:04 +0800
Subject: [PATCH 0012/1734] CLN: extract ApplyAdamBaseOp

---
 tensorflow/core/kernels/training_ops.cc | 146 +++++++++++++---
 tensorflow/core/kernels/training_ops.h | 13 ++
 .../core/kernels/training_ops_gpu.cu.cc | 30 ++++
 tensorflow/core/ops/training_ops.cc | 37 +++++
 4 files changed, 202 insertions(+), 24 deletions(-)

diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index 233aa03c323..7d383d980a5 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -328,6 +328,45 @@ struct ApplyAdamSYCL {
 template <typename T>
 struct ApplyAdam<CPUDevice, T> : ApplyAdamNonCuda<CPUDevice, T> {};

+template <typename Device, typename T>
+struct ApplyAdaMaxNonCuda {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  typename TTypes<T>::ConstScalar beta1_power,
+                  typename TTypes<T>::ConstScalar beta2_power,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar beta1,
+                  typename TTypes<T>::ConstScalar beta2,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad, bool use_nesterov) {
+    if (use_nesterov) {
+      LOG(WARNING) << "AdaMax doesn't support use_nesterov yet, ignore it.";
+    }
+    m.device(d) += (grad - m) * (T(1) - beta1());
+    // v == u
+    v.device(d) = (beta2() * v).cwiseMax(grad.abs());
+    // var == θ
+    var.device(d) -= (lr * m) / ((T(1) - beta1_power()) * v);
+  }
+};
+
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T>
+struct ApplyAdaMaxSYCL {
+  void operator()(const SYCLDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  T beta1_power, T beta2_power, T lr, T beta1, T beta2,
+                  T epsilon, typename TTypes<T>::ConstFlat grad) {
+    m.device(d) += (grad - m) * (T(1) - beta1);
+    v.device(d) = (beta2 * v).cwiseMax(grad.abs());
+    var.device(d) -= (lr * m) / ((T(1) - beta1_power) * v);
+  }
+};
+#endif  // TENSORFLOW_USE_SYCL
+
+template <typename T>
+struct ApplyAdaMax<CPUDevice, T> : ApplyAdaMaxNonCuda<CPUDevice, T> {};
+
 template <typename T>
 struct ApplyRMSProp<CPUDevice, T> {
   void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
@@ -2477,10 +2516,12 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS

-template <typename Device, typename T>
-class ApplyAdamOp : public OpKernel {
+template <typename Device, typename T,
+          template <typename, typename>
+          class Functor>
+class ApplyAdamBaseOp : public OpKernel {
  public:
-  explicit ApplyAdamOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+  explicit ApplyAdamBaseOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_));
   }
@@ -2553,11 +2594,11 @@ class ApplyAdamOp : public OpKernel {
                                         grad.shape().DebugString()));

     const Device& device = ctx->template eigen_device<Device>();
-    functor::ApplyAdam<Device, T>()(
-        device, var.flat<T>(), m.flat<T>(), v.flat<T>(),
-        beta1_power.scalar<T>(), beta2_power.scalar<T>(), lr.scalar<T>(),
-        beta1.scalar<T>(), beta2.scalar<T>(), epsilon.scalar<T>(),
-        grad.flat<T>(), use_nesterov_);
+    auto functor = Functor<Device, T>();
+    functor(device, var.flat<T>(), m.flat<T>(), v.flat<T>(),
+            beta1_power.scalar<T>(), beta2_power.scalar<T>(), lr.scalar<T>(),
+            beta1.scalar<T>(), beta2.scalar<T>(), epsilon.scalar<T>(),
+            grad.flat<T>(), use_nesterov_);

     MaybeForwardRefInputToRefOutput(ctx, 0, 0);
   }

 private:
  bool use_exclusive_lock_;
  bool use_nesterov_;
};

#ifdef TENSORFLOW_USE_SYCL
-template <typename T>
-class ApplyAdamOp<SYCLDevice, T> : public OpKernel {
+template <typename T, template <typename, typename> class Functor>
+class ApplyAdamBaseOp<SYCLDevice, T, Functor> : public OpKernel {
  public:
-  explicit ApplyAdamOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+  explicit ApplyAdamBaseOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
   }
@@ -2672,9 +2714,10 @@ class ApplyAdamOp : public OpKernel {
                     var.shape().DebugString(), " ",
                     grad.shape().DebugString()));

-    functor::ApplyAdamSYCL<T>()(device, var.flat<T>(), m.flat<T>(), v.flat<T>(),
-                                beta1_power, beta2_power, lr, beta1, beta2,
-                                epsilon, grad.flat<T>());
+    auto functor = Functor<SYCLDevice, T>();
+    functor(device, var.flat<T>(), m.flat<T>(), v.flat<T>(),
+            beta1_power, beta2_power, lr, beta1, beta2,
+            epsilon, grad.flat<T>());

     MaybeForwardRefInputToRefOutput(ctx, 0, 0);
   }
@@ -2684,28 +2727,28 @@ class ApplyAdamOp : public OpKernel {
 };
 #endif  // TENSORFLOW_USE_SYCL

-#define REGISTER_KERNELS(D, T)                                     \
+#define REGISTER_KERNELS(D, T, F)                                  \
   REGISTER_KERNEL_BUILDER(                                         \
       Name("ApplyAdam").Device(DEVICE_##D).TypeConstraint<T>("T"), \
-      ApplyAdamOp<D##Device, T>);                                  \
+      ApplyAdamBaseOp<D##Device, T, F>);                           \
   REGISTER_KERNEL_BUILDER(Name("ResourceApplyAdam")                \
                               .HostMemory("var")                   \
                               .HostMemory("m")                     \
                               .HostMemory("v")                     \
                               .Device(DEVICE_##D)                  \
                               .TypeConstraint<T>("T"),             \
-                          ApplyAdamOp<D##Device, T>);
-#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
-
+                          ApplyAdamBaseOp<D##Device, T, F>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T, functor::ApplyAdam);
 TF_CALL_half(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
 TF_CALL_double(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS

 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T);
-
+#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T, functor::ApplyAdamSYCL);
 TF_CALL_float(REGISTER_SYCL_KERNELS);
 TF_CALL_double(REGISTER_SYCL_KERNELS);
+#undef REGISTER_SYCL_KERNELS
 #endif

 #if GOOGLE_CUDA
@@ -2730,11 +2773,66 @@ DECLARE_GPU_SPEC(double);
 #undef DECLARE_GPU_SPEC
 }  // namespace functor

-REGISTER_KERNELS(GPU, Eigen::half);
-REGISTER_KERNELS(GPU, float);
-REGISTER_KERNELS(GPU, double);
+#define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T, functor::ApplyAdam);
+REGISTER_GPU_KERNELS(Eigen::half);
+REGISTER_GPU_KERNELS(float);
+REGISTER_GPU_KERNELS(double);
+#undef REGISTER_GPU_KERNELS
 #endif
+#undef REGISTER_KERNELS
+
+#define REGISTER_KERNELS(D, T, F)                                    \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("ApplyAdaMax").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+      ApplyAdamBaseOp<D##Device, T, F>);                             \
+  REGISTER_KERNEL_BUILDER(Name("ResourceApplyAdaMax")                \
+                              .HostMemory("var")                     \
+                              .HostMemory("m")                       \
+                              .HostMemory("v")                       \
+                              .Device(DEVICE_##D)                    \
+                              .TypeConstraint<T>("T"),               \
+                          ApplyAdamBaseOp<D##Device, T, F>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T, functor::ApplyAdaMax);
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
+
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T, functor::ApplyAdaMaxSYCL);
+TF_CALL_float(REGISTER_SYCL_KERNELS);
+TF_CALL_double(REGISTER_SYCL_KERNELS);
+#undef REGISTER_SYCL_KERNELS
+#endif
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                   \
+  template <>                                                 \
+  void ApplyAdaMax<GPUDevice, T>::operator()(                 \
+      const GPUDevice& d, typename TTypes<T>::Flat var,       \
+      typename TTypes<T>::Flat m, typename TTypes<T>::Flat v, \
+      typename TTypes<T>::ConstScalar beta1_power,            \
+      typename TTypes<T>::ConstScalar beta2_power,            \
+      typename TTypes<T>::ConstScalar lr,                     \
+      typename TTypes<T>::ConstScalar beta1,                  \
+      typename TTypes<T>::ConstScalar beta2,                  \
+      typename TTypes<T>::ConstScalar epsilon,                \
+      typename TTypes<T>::ConstFlat grad, bool use_nesterov); \
+  extern template struct ApplyAdaMax<GPUDevice, T>;
+DECLARE_GPU_SPEC(Eigen::half);
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+}  // namespace functor
+
+#define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T, functor::ApplyAdaMax);
+REGISTER_GPU_KERNELS(Eigen::half);
+REGISTER_GPU_KERNELS(float);
+REGISTER_GPU_KERNELS(double);
+#undef REGISTER_GPU_KERNELS
+#endif
 #undef REGISTER_KERNELS

 template <typename Device, typename T>
diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h
index 7ee956053ab..46a52902108 100644
--- a/tensorflow/core/kernels/training_ops.h
+++ b/tensorflow/core/kernels/training_ops.h
@@ -139,6 +139,19 @@ struct ApplyAdam {
                   typename TTypes<T>::ConstFlat grad, bool use_nesterov);
 };

+template <typename Device, typename T>
+struct ApplyAdaMax {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  typename TTypes<T>::ConstScalar beta1_power,
+                  typename TTypes<T>::ConstScalar beta2_power,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar beta1,
+                  typename TTypes<T>::ConstScalar beta2,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad, bool use_nesterov);
+};
+
 template <typename Device, typename T>
 struct ApplyRMSProp {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
index 0376a3b2c60..1776c108ab2 100644
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -142,6 +142,32 @@ struct ApplyAdam {
   }
 };

+template <typename T>
+struct ApplyAdaMax<GPUDevice, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  typename TTypes<T>::ConstScalar beta1_power,
+                  typename TTypes<T>::ConstScalar beta2_power,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar beta1,
+                  typename TTypes<T>::ConstScalar beta2,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad, bool use_nesterov) {
+    Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+    bcast[0] = grad.dimension(0);
+    Eigen::Sizes<1> single;
+    const auto one = static_cast<T>(1.0);
+    m.device(d) =
+        m + (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) *
+                (grad - m);
+    v.device(d) =
+        (beta2.reshape(single).broadcast(bcast) * v).cwiseMax(grad.abs());
+    var.device(d) -=
+        (lr * m) / ((beta1_power.constant(one) -
+                     beta1_power).reshape(single).broadcast(bcast) * v);
+  }
+};
+
 template <typename T>
 struct ApplyRMSProp<GPUDevice, T> {
   void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
@@ -278,6 +304,10 @@
 template struct functor::ApplyAdam<GPUDevice, Eigen::half>;
 template struct functor::ApplyAdam<GPUDevice, float>;
 template struct functor::ApplyAdam<GPUDevice, double>;

+template struct functor::ApplyAdaMax<GPUDevice, Eigen::half>;
+template struct functor::ApplyAdaMax<GPUDevice, float>;
+template struct functor::ApplyAdaMax<GPUDevice, double>;
+
 template struct functor::ApplyRMSProp<GPUDevice, Eigen::half>;
 template struct functor::ApplyRMSProp<GPUDevice, float>;
 template struct functor::ApplyRMSProp<GPUDevice, double>;
diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc
index
6ce9595fb60..6f107db3eac 100644 --- a/tensorflow/core/ops/training_ops.cc +++ b/tensorflow/core/ops/training_ops.cc @@ -737,6 +737,43 @@ REGISTER_OP("ResourceApplyAdam") return ApplyAdamShapeFn(c, false /* sparse */); }); +REGISTER_OP("ApplyAdaMax") + .Input("var: Ref(T)") + .Input("m: Ref(T)") + .Input("v: Ref(T)") + .Input("beta1_power: T") + .Input("beta2_power: T") + .Input("lr: T") + .Input("beta1: T") + .Input("beta2: T") + .Input("epsilon: T") + .Input("grad: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .Attr("use_nesterov: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdamShapeFn(c, false /* sparse */); + }); + +REGISTER_OP("ResourceApplyAdaMax") + .Input("var: resource") + .Input("m: resource") + .Input("v: resource") + .Input("beta1_power: T") + .Input("beta2_power: T") + .Input("lr: T") + .Input("beta1: T") + .Input("beta2: T") + .Input("epsilon: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .Attr("use_nesterov: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdamShapeFn(c, false /* sparse */); + }); + static Status ApplyRMSPropShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; ShapeHandle s = ShapeOrHandleShape(c, 0); // var From 4d31dac8111b963ed427969c71c6957c929d3e5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Wed, 21 Feb 2018 20:29:46 +0800 Subject: [PATCH 0013/1734] ENH: add AdaMaxOptimizer in python side --- tensorflow/contrib/opt/BUILD | 20 +++ tensorflow/contrib/opt/__init__.py | 2 + .../contrib/opt/python/training/adamax.py | 72 ++++++++++ .../opt/python/training/adamax_test.py | 124 ++++++++++++++++++ tensorflow/core/kernels/training_ops.cc | 2 +- 5 files changed, 219 insertions(+), 1 deletion(-) create mode 100644 tensorflow/contrib/opt/python/training/adamax.py create mode 100644 tensorflow/contrib/opt/python/training/adamax_test.py diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD index 86ceda71b70..a86d150f7a0 100644 --- a/tensorflow/contrib/opt/BUILD +++ b/tensorflow/contrib/opt/BUILD @@ -14,6 +14,7 @@ py_library( name = "opt_py", srcs = [ "__init__.py", + "python/training/adamax.py", "python/training/addsign.py", "python/training/drop_stale_gradient_optimizer.py", "python/training/elastic_average_optimizer.py", @@ -48,6 +49,25 @@ py_library( ], ) +py_test( + name = "adamax_test", + srcs = ["python/training/adamax_test.py"], + srcs_version = "PY2AND3", + tags = [ + "no_oss", # b/73507407 + "notsan", # b/31055119 + ], + deps = [ + ":opt_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:math_ops", + "//tensorflow/python:training", + "//third_party/py/numpy", + ], +) + py_test( name = "external_optimizer_test", srcs = ["python/training/external_optimizer_test.py"], diff --git a/tensorflow/contrib/opt/__init__.py b/tensorflow/contrib/opt/__init__.py index 6c1bb1adc09..4c13c8e2471 100644 --- a/tensorflow/contrib/opt/__init__.py +++ b/tensorflow/contrib/opt/__init__.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function # pylint: disable=wildcard-import +from tensorflow.contrib.opt.python.training.adamax import * from tensorflow.contrib.opt.python.training.addsign import * from tensorflow.contrib.opt.python.training.drop_stale_gradient_optimizer import * from 
tensorflow.contrib.opt.python.training.external_optimizer import * @@ -36,6 +37,7 @@ from tensorflow.python.util.all_util import remove_undocumented _allowed_symbols = [ + 'AdaMaxOptimizer', 'PowerSignOptimizer', 'AddSignOptimizer', 'DelayCompensatedGradientDescentOptimizer', diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py new file mode 100644 index 00000000000..4e0c541d3a1 --- /dev/null +++ b/tensorflow/contrib/opt/python/training/adamax.py @@ -0,0 +1,72 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""AdaMax for TensorFlow.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.eager import context +from tensorflow.python.framework import ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.training import optimizer +from tensorflow.python.training import adam +from tensorflow.python.training import training_ops +from tensorflow.python.util.tf_export import tf_export + + +@tf_export("train.AdaMaxOptimizer") +class AdaMaxOptimizer(adam.AdamOptimizer): + """Optimizer that implements the AdaMax algorithm. + + See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980) + ([pdf](http://arxiv.org/pdf/1412.6980.pdf)). 
+ """ + + def _apply_dense(self, grad, var): + m = self.get_slot(var, "m") + v = self.get_slot(var, "v") + beta1_power, beta2_power = self._get_beta_accumulators() + return training_ops.apply_ada_max( + var, m, v, + math_ops.cast(beta1_power, var.dtype.base_dtype), + math_ops.cast(beta2_power, var.dtype.base_dtype), + math_ops.cast(self._lr_t, var.dtype.base_dtype), + math_ops.cast(self._beta1_t, var.dtype.base_dtype), + math_ops.cast(self._beta2_t, var.dtype.base_dtype), + math_ops.cast(self._epsilon_t, var.dtype.base_dtype), + grad, use_locking=self._use_locking).op + + def _resource_apply_dense(self, grad, var): + m = self.get_slot(var, "m") + v = self.get_slot(var, "v") + beta1_power, beta2_power = self._get_beta_accumulators() + return training_ops.resource_apply_ada_max( + var.handle, m.handle, v.handle, + math_ops.cast(beta1_power, grad.dtype.base_dtype), + math_ops.cast(beta2_power, grad.dtype.base_dtype), + math_ops.cast(self._lr_t, grad.dtype.base_dtype), + math_ops.cast(self._beta1_t, grad.dtype.base_dtype), + math_ops.cast(self._beta2_t, grad.dtype.base_dtype), + math_ops.cast(self._epsilon_t, grad.dtype.base_dtype), + grad, use_locking=self._use_locking) + + def _apply_sparse_shared(self, grad, var, indices, scatter_add): + raise NotImplementedError() + + def _apply_sparse(self, grad, var): + raise NotImplementedError() diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py new file mode 100644 index 00000000000..a1499118dd3 --- /dev/null +++ b/tensorflow/contrib/opt/python/training/adamax_test.py @@ -0,0 +1,124 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for AdaMax.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.opt.python.training import adamax +from tensorflow.python.client import session +from tensorflow.python.eager import context +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import test + + +def adamax_update_numpy(param, + g_t, + t, + m, + v, + alpha=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8): + m_t = beta1 * m + (1 - beta1) * g_t + v_t = np.maximum(beta2 * v, np.abs(g_t)) + param_t = param - (alpha / (1 - beta1**t)) * m_t / v_t + return param_t, m_t, v_t + + +class AdaMaxOptimizerTest(test.TestCase): + + def doTestBasic(self, use_resource=False): + for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]): + with self.test_session(graph=ops.Graph()): + # Initialize variables for numpy implementation. + m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + if use_resource: + var0 = resource_variable_ops.ResourceVariable( + var0_np, name="var0_%d" % i) + var1 = resource_variable_ops.ResourceVariable( + var1_np, name="var1_%d" % i) + else: + var0 = variables.Variable(var0_np) + var1 = variables.Variable(var1_np) + grads0 = constant_op.constant(grads0_np) + grads1 = constant_op.constant(grads1_np) + + opt = adamax.AdaMaxOptimizer() + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + opt_variables = opt.variables() + beta1_power, beta2_power = opt._get_beta_accumulators() + self.assertTrue(beta1_power is not None) + self.assertTrue(beta2_power is not None) + self.assertIn(beta1_power, opt_variables) + self.assertIn(beta2_power, opt_variables) + + with ops.Graph().as_default(): + # Shouldn't return non-slot variables from other graphs. 
+ self.assertEqual(0, len(opt.variables())) + + if context.in_graph_mode(): + self.evaluate(variables.global_variables_initializer()) + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + + beta1_power, beta2_power = opt._get_beta_accumulators() + + # Run 3 steps of Adam + for t in range(1, 4): + if context.in_graph_mode(): + self.evaluate(update) + elif t > 1: + opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + + self.assertAllCloseAccordingToType(0.9**(t + 1), + self.evaluate(beta1_power)) + self.assertAllCloseAccordingToType(0.999**(t + 1), + self.evaluate(beta2_power)) + + var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0) + var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1) + + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) + if use_resource: + self.assertEqual("var0_%d/Adam:0" % (i,), + opt.get_slot(var=var0, name="m").name) + + def testBasic(self): + with self.test_session(): + self.doTestBasic(use_resource=False) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc index 7d383d980a5..b3b53d9ee04 100644 --- a/tensorflow/core/kernels/training_ops.cc +++ b/tensorflow/core/kernels/training_ops.cc @@ -346,7 +346,7 @@ struct ApplyAdaMaxNonCuda { // v == u v.device(d) = (beta2() * v).cwiseMax(grad.abs()); // var == θ - var.device(d) -= (lr * m) / ((T(1) - beta1_power()) * v); + var.device(d) -= (lr() * m) / ((T(1) - beta1_power()) * v); } }; From ba258d530f1af5fbcc8c1b72637dc7b2177a48c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Fri, 2 Mar 2018 19:33:30 +0800 Subject: [PATCH 0014/1734] ENH: support sparse grad --- .../contrib/opt/python/training/adamax.py | 51 +++++++++++++++++-- .../opt/python/training/adamax_test.py | 2 +- tensorflow/core/kernels/training_ops.cc | 4 +- .../core/kernels/training_ops_gpu.cu.cc | 5 +- 4 files changed, 52 insertions(+), 10 deletions(-) diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py index 4e0c541d3a1..137fce769f7 100644 --- a/tensorflow/contrib/opt/python/training/adamax.py +++ b/tensorflow/contrib/opt/python/training/adamax.py @@ -18,12 +18,12 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.eager import context from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import resource_variable_ops -from tensorflow.python.training import optimizer +from tensorflow.python.ops import state_ops from tensorflow.python.training import adam from tensorflow.python.training import training_ops from tensorflow.python.util.tf_export import tf_export @@ -65,8 +65,49 @@ class AdaMaxOptimizer(adam.AdamOptimizer): math_ops.cast(self._epsilon_t, grad.dtype.base_dtype), grad, use_locking=self._use_locking) - def _apply_sparse_shared(self, grad, var, indices, scatter_add): - raise NotImplementedError() + def _apply_sparse_shared(self, grad, var, indices, + scatter_add, scatter_update): + beta1_power, beta2_power = self._get_beta_accumulators() + beta1_power = 
math_ops.cast(beta1_power, var.dtype.base_dtype) + beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype) + lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) + beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) + beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) + epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) + # m_t = beta1 * m + (1 - beta1) * g_t + m = self.get_slot(var, "m") + m_slice = array_ops.gather(m, indices) + m_t_slice = m_slice * beta1_t + grad * (1 - beta1_t) + with ops.control_dependencies([m_t_slice]): + m_t = scatter_update(m, indices, m_t_slice) + # u_t = max(beta2 * u, abs(g_t)) + v = self.get_slot(var, "v") + v_slice = array_ops.gather(v, indices) + v_t_slice = math_ops.maximum(v_slice * beta2_t, math_ops.abs(grad)) + with ops.control_dependencies([v_t_slice]): + v_t = scatter_update(v, indices, v_t_slice) + # theta_t = theta - lr / (1 - beta1^t) * m_t / u_t + var_slice = -lr_t / (1 - beta1_power) * (m_t_slice / + (v_t_slice + epsilon_t)) + with ops.control_dependencies([var_slice]): + var_update = scatter_add(var, indices, var_slice) + return control_flow_ops.group(*[var_update, m_t, v_t]) def _apply_sparse(self, grad, var): - raise NotImplementedError() + return self._apply_sparse_shared( + grad.values, var, grad.indices, + lambda x, i, v: state_ops.scatter_add( # pylint: disable=g-long-lambda + x, i, v, use_locking=self._use_locking), + lambda x, i, v: state_ops.scatter_update( # pylint: disable=g-long-lambda + x, i, v, use_locking=self._use_locking)) + + def _resource_scatter_update(self, x, i, v): + with ops.control_dependencies( + [resource_variable_ops.resource_scatter_update( + x.handle, i, v)]): + return x.value() + + def _resource_apply_sparse(self, grad, var, indices): + return self._apply_sparse_shared( + grad, var, indices, + self._resource_scatter_add, self._resource_scatter_update) diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py index a1499118dd3..0e2ba0987a7 100644 --- a/tensorflow/contrib/opt/python/training/adamax_test.py +++ b/tensorflow/contrib/opt/python/training/adamax_test.py @@ -45,7 +45,7 @@ def adamax_update_numpy(param, epsilon=1e-8): m_t = beta1 * m + (1 - beta1) * g_t v_t = np.maximum(beta2 * v, np.abs(g_t)) - param_t = param - (alpha / (1 - beta1**t)) * m_t / v_t + param_t = param - (alpha / (1 - beta1**t)) * m_t / (v_t + epsilon) return param_t, m_t, v_t diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc index b3b53d9ee04..0387e3011ea 100644 --- a/tensorflow/core/kernels/training_ops.cc +++ b/tensorflow/core/kernels/training_ops.cc @@ -346,7 +346,7 @@ struct ApplyAdaMaxNonCuda { // v == u v.device(d) = (beta2() * v).cwiseMax(grad.abs()); // var == θ - var.device(d) -= (lr() * m) / ((T(1) - beta1_power()) * v); + var.device(d) -= lr() / (T(1) - beta1_power()) * (m / (v + epsilon())); } }; @@ -359,7 +359,7 @@ struct ApplyAdaMaxSYCL { T epsilon, typename TTypes::ConstFlat grad) { m.device(d) += (grad - m) * (T(1) - beta1); v.device(d) = (beta2 * v).cwiseMax(grad.abs()); - var.device(d) -= (lr * m) / ((T(1) - beta1_power) * v); + var.device(d) -= lr / (T(1) - beta1_power) * (m / (v + epsilon)); } }; #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc index 1776c108ab2..54c06b130ce 100644 --- a/tensorflow/core/kernels/training_ops_gpu.cu.cc +++ 
b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -163,8 +163,9 @@ struct ApplyAdaMax<GPUDevice, T> {
     v.device(d) =
         (beta2.reshape(single).broadcast(bcast) * v).cwiseMax(grad.abs());
     var.device(d) -=
-        (lr * m) / ((beta1_power.constant(one) -
-                     beta1_power).reshape(single).broadcast(bcast) * v);
+        lr / (beta1_power.constant(one) -
+              beta1_power).reshape(single).broadcast(bcast) *
+        (m / (v + epsilon));
   }
 };

From f6f5a6019970bb8d667819da7d6316a8088a0b78 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
Date: Sat, 3 Mar 2018 10:02:43 +0800
Subject: [PATCH 0015/1734] DOC: add document

---
 .../contrib/opt/python/training/adamax.py | 51 ++++++++++++++++++-
 1 file changed, 50 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py
index 137fce769f7..ddae06bec76 100644
--- a/tensorflow/contrib/opt/python/training/adamax.py
+++ b/tensorflow/contrib/opt/python/training/adamax.py
@@ -29,7 +29,6 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export

-@tf_export("train.AdaMaxOptimizer")
 class AdaMaxOptimizer(adam.AdamOptimizer):
   """Optimizer that implements the AdaMax algorithm.

@@ -37,6 +36,56 @@ class AdaMaxOptimizer(adam.AdamOptimizer):
   ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
   """

+  def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
+               use_locking=False, name="AdaMax"):
+    """Construct a new AdaMax optimizer.
+
+    Initialization:
+
+    ```
+    m_0 <- 0 (Initialize initial 1st moment vector)
+    v_0 <- 0 (Initialize the exponentially weighted infinity norm)
+    t <- 0 (Initialize timestep)
+    ```
+
+    The update rule for `variable` with gradient `g` uses an optimization
+    described at the end of section 7.1 of the paper:
+
+    ```
+    t <- t + 1
+    lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
+
+    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+    v_t <- max(beta2 * v_{t-1}, abs(g))
+    variable <- variable - lr_t / (1 - beta1^t) * m_t / (v_t + epsilon)
+    ```
+
+    Similar to AdamOptimizer, the epsilon is added for numerical stability
+    (especially to get rid of division by zero when v_t = 0).
+
+    In contrast to AdamOptimizer, the sparse implementation of this algorithm
+    (used when the gradient is an IndexedSlices object, typically because of
+    `tf.gather` or an embedding lookup in the forward pass) only updates
+    variable slices and corresponding `m_t`, `v_t` terms when that part of
+    the variable was used in the forward pass. This means that the sparse
+    behavior is in contrast to the dense behavior (similar to some momentum
+    implementations which ignore momentum unless a variable slice was actually
+    used).
+
+    Args:
+      learning_rate: A Tensor or a floating point value. The learning rate.
+      beta1: A float value or a constant float tensor.
+        The exponential decay rate for the 1st moment estimates.
+      beta2: A float value or a constant float tensor.
+        The exponential decay rate for the exponentially weighted infinity norm.
+      epsilon: A small constant for numerical stability.
+      use_locking: If True use locks for update operations.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "AdaMax".
+ """ + super(AdaMaxOptimizer, self).__init__(learning_rate, beta1, beta2, + epsilon, use_locking, name) + def _apply_dense(self, grad, var): m = self.get_slot(var, "m") v = self.get_slot(var, "v") From f750e21a63c8836b9e7243ce786af2de3f65cc3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Sat, 3 Mar 2018 12:31:54 +0800 Subject: [PATCH 0016/1734] TST: add more tests --- .../contrib/opt/python/training/adamax.py | 2 +- .../opt/python/training/adamax_test.py | 243 +++++++++++++++++- 2 files changed, 233 insertions(+), 12 deletions(-) diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py index ddae06bec76..36d49d4cbf8 100644 --- a/tensorflow/contrib/opt/python/training/adamax.py +++ b/tensorflow/contrib/opt/python/training/adamax.py @@ -159,4 +159,4 @@ class AdaMaxOptimizer(adam.AdamOptimizer): def _resource_apply_sparse(self, grad, var, indices): return self._apply_sparse_shared( grad, var, indices, - self._resource_scatter_add, self._resource_scatter_update) + self._resource_scatter_add, self._resource_scatter_update) diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py index 0e2ba0987a7..e91e5cb96a5 100644 --- a/tensorflow/contrib/opt/python/training/adamax_test.py +++ b/tensorflow/contrib/opt/python/training/adamax_test.py @@ -35,22 +35,142 @@ from tensorflow.python.platform import test def adamax_update_numpy(param, - g_t, - t, - m, - v, - alpha=0.001, - beta1=0.9, - beta2=0.999, - epsilon=1e-8): + g_t, + t, + m, + v, + alpha=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8): m_t = beta1 * m + (1 - beta1) * g_t v_t = np.maximum(beta2 * v, np.abs(g_t)) - param_t = param - (alpha / (1 - beta1**t)) * m_t / (v_t + epsilon) + param_t = param - (alpha / (1 - beta1**t)) * (m_t / (v_t + epsilon)) + return param_t, m_t, v_t + + +def adamax_sparse_update_numpy(param, + indices, + g_t, + t, + m, + v, + alpha=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8): + m_t, v_t, param_t = np.copy(m), np.copy(v), np.copy(param) + m_t_slice = beta1 * m[indices] + (1 - beta1) * g_t + v_t_slice = np.maximum(beta2 * v[indices], np.abs(g_t)) + param_t_slice = param[indices] - ((alpha / (1 - beta1**t)) * + (m_t_slice / (v_t_slice + epsilon))) + m_t[indices] = m_t_slice + v_t[indices] = v_t_slice + param_t[indices] = param_t_slice return param_t, m_t, v_t class AdaMaxOptimizerTest(test.TestCase): + def doTestSparse(self, use_resource=False): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + # Initialize variables for numpy implementation. 
+ zero_slots = lambda: np.zeros((3), dtype=dtype.as_numpy_dtype) + m0, v0, m1, v1 = zero_slots(), zero_slots(), zero_slots(), zero_slots() + var0_np = np.array([1.0, 2.0, 3.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([4.0, 5.0, 6.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + if use_resource: + var0 = resource_variable_ops.ResourceVariable(var0_np) + var1 = resource_variable_ops.ResourceVariable(var1_np) + else: + var0 = variables.Variable(var0_np) + var1 = variables.Variable(var1_np) + grads0_np_indices = np.array([0, 1], dtype=np.int32) + grads0 = ops.IndexedSlices( + constant_op.constant(grads0_np), + constant_op.constant(grads0_np_indices), constant_op.constant([2])) + grads1_np_indices = np.array([2, 1], dtype=np.int32) + grads1 = ops.IndexedSlices( + constant_op.constant(grads1_np), + constant_op.constant(grads1_np_indices), constant_op.constant([2])) + opt = adamax.AdaMaxOptimizer() + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0, 3.0], var0.eval()) + self.assertAllClose([4.0, 5.0, 6.0], var1.eval()) + + beta1_power, beta2_power = opt._get_beta_accumulators() + + # Run 3 steps of AdaMax + for t in range(1, 4): + self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval()) + self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval()) + update.run() + + var0_np, m0, v0 = adamax_sparse_update_numpy( + var0_np, grads0_np_indices, grads0_np, t, m0, v0) + var1_np, m1, v1 = adamax_sparse_update_numpy( + var1_np, grads1_np_indices, grads1_np, t, m1, v1) + + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, var0.eval()) + self.assertAllCloseAccordingToType(var1_np, var1.eval()) + + def testSparse(self): + self.doTestSparse(use_resource=False) + + def testResourceSparse(self): + self.doTestSparse(use_resource=True) + + def testSparseDevicePlacement(self): + for index_dtype in [dtypes.int32, dtypes.int64]: + with self.test_session(force_gpu=test.is_gpu_available()): + # If a GPU is available, tests that all optimizer ops can be placed on + # it (i.e. they have GPU kernels). 
+ var = variables.Variable([[1.0], [2.0]]) + indices = constant_op.constant([0, 1], dtype=index_dtype) + gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices)) + optimizer = adamax.AdaMaxOptimizer(3.0) + minimize_op = optimizer.minimize(gathered_sum) + variables.global_variables_initializer().run() + minimize_op.run() + + def testSparseRepeatedIndices(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + repeated_index_update_var = variables.Variable( + [[1.0], [2.0]], dtype=dtype) + aggregated_update_var = variables.Variable( + [[1.0], [2.0]], dtype=dtype) + grad_repeated_index = ops.IndexedSlices( + constant_op.constant( + [0.1, 0.1], shape=[2, 1], dtype=dtype), + constant_op.constant([1, 1]), + constant_op.constant([2, 1])) + grad_aggregated = ops.IndexedSlices( + constant_op.constant( + [0.2], shape=[1, 1], dtype=dtype), + constant_op.constant([1]), + constant_op.constant([2, 1])) + repeated_update = adamax.AdaMaxOptimizer().apply_gradients( + [(grad_repeated_index, repeated_index_update_var)]) + aggregated_update = adamax.AdaMaxOptimizer().apply_gradients( + [(grad_aggregated, aggregated_update_var)]) + variables.global_variables_initializer().run() + self.assertAllClose(aggregated_update_var.eval(), + repeated_index_update_var.eval()) + for _ in range(3): + repeated_update.run() + aggregated_update.run() + self.assertAllClose(aggregated_update_var.eval(), + repeated_index_update_var.eval()) + def doTestBasic(self, use_resource=False): for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]): with self.test_session(graph=ops.Graph()): @@ -93,7 +213,7 @@ class AdaMaxOptimizerTest(test.TestCase): beta1_power, beta2_power = opt._get_beta_accumulators() - # Run 3 steps of Adam + # Run 3 steps of AdaMax for t in range(1, 4): if context.in_graph_mode(): self.evaluate(update) @@ -112,13 +232,114 @@ class AdaMaxOptimizerTest(test.TestCase): self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) if use_resource: - self.assertEqual("var0_%d/Adam:0" % (i,), + self.assertEqual("var0_%d/AdaMax:0" % (i,), opt.get_slot(var=var0, name="m").name) def testBasic(self): with self.test_session(): self.doTestBasic(use_resource=False) + @test_util.run_in_graph_and_eager_modes(reset_test=True) + def testResourceBasic(self): + self.doTestBasic(use_resource=True) + + def testTensorLearningRate(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + # Initialize variables for numpy implementation. 
+ m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = variables.Variable(var0_np) + var1 = variables.Variable(var1_np) + grads0 = constant_op.constant(grads0_np) + grads1 = constant_op.constant(grads1_np) + opt = adamax.AdaMaxOptimizer(constant_op.constant(0.001)) + update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + + beta1_power, beta2_power = opt._get_beta_accumulators() + + # Run 3 steps of AdaMax + for t in range(1, 4): + self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval()) + self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval()) + update.run() + + var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0) + var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1) + + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, var0.eval()) + self.assertAllCloseAccordingToType(var1_np, var1.eval()) + + def testSharing(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + # Initialize variables for numpy implementation. + m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = variables.Variable(var0_np) + var1 = variables.Variable(var1_np) + grads0 = constant_op.constant(grads0_np) + grads1 = constant_op.constant(grads1_np) + opt = adamax.AdaMaxOptimizer() + update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + + beta1_power, beta2_power = opt._get_beta_accumulators() + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + + # Run 3 steps of intertwined AdaMax1 and AdaMax2. + for t in range(1, 4): + self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval()) + self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval()) + if t % 2 == 0: + update1.run() + else: + update2.run() + + var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0) + var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1) + + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, var0.eval()) + self.assertAllCloseAccordingToType(var1_np, var1.eval()) + + def testTwoSessions(self): + optimizer = adamax.AdaMaxOptimizer() + g = ops.Graph() + with g.as_default(): + with session.Session(): + var0 = variables.Variable(np.array([1.0, 2.0]), name="v0") + grads0 = constant_op.constant(np.array([0.1, 0.1])) + optimizer.apply_gradients([(grads0, var0)]) + + gg = ops.Graph() + with gg.as_default(): + with session.Session(): + var0 = variables.Variable(np.array([1.0, 2.0]), name="v0") + grads0 = constant_op.constant(np.array([0.1, 0.1])) + + # If the optimizer saves any state not keyed by graph the following line + # fails. 
+      optimizer.apply_gradients([(grads0, var0)])

if __name__ == "__main__":
  test.main()

From 8b5e4ad404ba16919ad4f17a763ee5383d61a400 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
Date: Sat, 3 Mar 2018 17:39:56 +0800
Subject: [PATCH 0017/1734] DOC: add apidef

---
 .../contrib/opt/python/training/adamax.py | 3 +-
 .../base_api/api_def_ApplyAdaMax.pbtxt | 89 +++++++++++++++++++
 .../api_def_ResourceApplyAdaMax.pbtxt | 83 +++++++++++++++++
 3 files changed, 173 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt

diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py
index 36d49d4cbf8..fe5522a1708 100644
--- a/tensorflow/contrib/opt/python/training/adamax.py
+++ b/tensorflow/contrib/opt/python/training/adamax.py
@@ -53,11 +53,10 @@ class AdaMaxOptimizer(adam.AdamOptimizer):

     ```
     t <- t + 1
-    lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)

     m_t <- beta1 * m_{t-1} + (1 - beta1) * g
     v_t <- max(beta2 * v_{t-1}, abs(g))
-    variable <- variable - lr_t / (1 - beta1^t) * m_t / (v_t + epsilon)
+    variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
     ```

     Similar to AdamOptimizer, the epsilon is added for numerical stability
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
new file mode 100644
index 00000000000..106c30ca83a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
@@ -0,0 +1,89 @@
+op {
+  graph_op_name: "ApplyAdaMax"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }

From: Yong Tang
Date: Mon, 5 Mar 2018 17:41:00 +0000
Subject: [PATCH 0018/1734] Update the documentation of `softmax_cross_entropy`

This fix updates the documentation of `softmax_cross_entropy`, and removed
the shape restrictions of `onehot_labels` and `logits`. They only need to be
of the same shape, not necessarily `[batch_size, num_classes]`.

This fix fixes 16263.

Signed-off-by: Yong Tang
---
 tensorflow/python/ops/losses/losses_impl.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 7386976e93f..04c13cb6c64 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -710,11 +710,16 @@ def softmax_cross_entropy(
       new_onehot_labels = onehot_labels * (1 - label_smoothing)
                           + label_smoothing / num_classes

+  Note that `onehot_labels` and `logits` must have the same shape,
+  e.g. `[batch_size, num_classes]`. The shape of `weights` must be
+  broadcastable to loss, whose shape is decided by the shape of `logits`.
+  In case the shape of `logits` is `[batch_size, num_classes]`, loss is
+  a `Tensor` of shape `[batch_size]`.
+
   Args:
-    onehot_labels: `[batch_size, num_classes]` target one-hot-encoded labels.
-    logits: `[batch_size, num_classes]` logits outputs of the network .
-    weights: Optional `Tensor` whose rank is either 0, or rank 1 and is
-      broadcastable to the loss which is a `Tensor` of shape `[batch_size]`.
+    onehot_labels: One-hot-encoded labels.
+    logits: Logits outputs of the network.
+    weights: Optional `Tensor` that is broadcastable to loss.
     label_smoothing: If greater than 0 then smooth the labels.
     scope: the scope for the operations performed in computing the loss.
loss_collection: collection to which the loss will be added. From f82d009d878dc675a307e69f89ba9f4dfdcd6c71 Mon Sep 17 00:00:00 2001 From: imsheridan Date: Wed, 7 Mar 2018 21:58:39 +0800 Subject: [PATCH 0019/1734] Fix broken link of typical distributed configuration in graphs.md --- tensorflow/docs_src/programmers_guide/graphs.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md index e69b717432e..ca74b175426 100644 --- a/tensorflow/docs_src/programmers_guide/graphs.md +++ b/tensorflow/docs_src/programmers_guide/graphs.md @@ -210,9 +210,8 @@ with tf.device("/device:GPU:0"): # Operations created in this context will be pinned to the GPU. result = tf.matmul(weights, img) ``` -If you are deploying TensorFlow in a @{$deploy/distributed$typical distributed configuration}, -you might specify the job name and task ID to place variables on -a task in the parameter server job (`"/job:ps"`), and the other operations on + +If you are deploying TensorFlow in a typical @{$deploy/distributed} configuration, you might specify the job name and task ID to place variables on a task in the parameter server job (`"/job:ps"`), and the other operations on task in the worker job (`"/job:worker"`): ```python From 04b6127510793b4c5aaa540b60b68ffdf3fd48ce Mon Sep 17 00:00:00 2001 From: imsheridan Date: Wed, 7 Mar 2018 22:23:50 +0800 Subject: [PATCH 0020/1734] revert the minor space nit --- tensorflow/docs_src/programmers_guide/graphs.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md index ca74b175426..3b5e3e5a9a1 100644 --- a/tensorflow/docs_src/programmers_guide/graphs.md +++ b/tensorflow/docs_src/programmers_guide/graphs.md @@ -210,8 +210,9 @@ with tf.device("/device:GPU:0"): # Operations created in this context will be pinned to the GPU. result = tf.matmul(weights, img) ``` - -If you are deploying TensorFlow in a typical @{$deploy/distributed} configuration, you might specify the job name and task ID to place variables on a task in the parameter server job (`"/job:ps"`), and the other operations on +If you are deploying TensorFlow in a typical @{$deploy/distributed} configuration, +you might specify the job name and task ID to place variables on +a task in the parameter server job (`"/job:ps"`), and the other operations on task in the worker job (`"/job:worker"`): ```python From 2548a3d2cf035a229d35ab6257bee511aa3a8e23 Mon Sep 17 00:00:00 2001 From: imsheridan Date: Thu, 8 Mar 2018 00:15:22 +0800 Subject: [PATCH 0021/1734] fix some typo --- tensorflow/docs_src/programmers_guide/graphs.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md index 3b5e3e5a9a1..f28660d44a9 100644 --- a/tensorflow/docs_src/programmers_guide/graphs.md +++ b/tensorflow/docs_src/programmers_guide/graphs.md @@ -505,10 +505,10 @@ multiple graphs in the same process. As noted above, TensorFlow provides a "default graph" that is implicitly passed to all API functions in the same context. For many applications, a single graph is sufficient. However, TensorFlow also provides methods for manipulating -the default graph, which can be useful in more advanced used cases. For example: +the default graph, which can be useful in more advanced use cases. 
For example:

 * A @{tf.Graph} defines the namespace for @{tf.Operation} objects: each
-  operation in a single graph must have a unique name. TensorFlow will
+  operation in a single graph must have an unique name. TensorFlow will
   "uniquify" the names of operations by appending `"_1"`, `"_2"`, and so on
   to their names if the requested name is already taken. Using multiple
   explicitly created graphs gives you more control over what name is given to
   each

From cee41f9d10b81ce3b49f566ddd448a7f3f2872c3 Mon Sep 17 00:00:00 2001
From: KB Sriram
Date: Wed, 7 Mar 2018 08:11:03 -0800
Subject: [PATCH 0022/1734] C++ gradient for StridedSlice

See https://github.com/tensorflow/tensorflow/issues/9645
---
 tensorflow/cc/gradients/array_grad.cc      | 36 ++++++++++++++++++++++
 tensorflow/cc/gradients/array_grad_test.cc | 24 +++++++++++++++
 2 files changed, 60 insertions(+)

diff --git a/tensorflow/cc/gradients/array_grad.cc b/tensorflow/cc/gradients/array_grad.cc
index 6545e4ee3eb..ff348fadb24 100644
--- a/tensorflow/cc/gradients/array_grad.cc
+++ b/tensorflow/cc/gradients/array_grad.cc
@@ -385,6 +385,42 @@ Status MirrorPadGradGrad(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("MirrorPadGrad", MirrorPadGradGrad);

+Status StridedSliceGradHelper(const Scope& scope, const Operation& op,
+                              const std::vector<Output>& grad_inputs,
+                              std::vector<Output>* grad_outputs) {
+  Input x = Shape(scope, op.input(0));
+  Input begin = op.input(1);
+  Input end = op.input(2);
+  Input strides = op.input(3);
+  int64 begin_mask;
+  int64 end_mask;
+  int64 ellipsis_mask;
+  int64 new_axis_mask;
+  int64 shrink_axis_mask;
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "begin_mask", &begin_mask));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "end_mask", &end_mask));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "ellipsis_mask", &ellipsis_mask));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "new_axis_mask", &new_axis_mask));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "shrink_axis_mask", &shrink_axis_mask));
+  grad_outputs->push_back(
+      StridedSliceGrad(scope, x, begin, end, strides, grad_inputs[0],
+                       StridedSliceGrad::BeginMask(begin_mask)
+                           .EndMask(end_mask)
+                           .EllipsisMask(ellipsis_mask)
+                           .NewAxisMask(new_axis_mask)
+                           .ShrinkAxisMask(shrink_axis_mask)));
+  // No gradients returned for begin, end and strides
+  grad_outputs->push_back(NoGradient());
+  grad_outputs->push_back(NoGradient());
+  grad_outputs->push_back(NoGradient());
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("StridedSlice", StridedSliceGradHelper);
+
 }  // anonymous namespace
 }  // namespace ops
 }  // namespace tensorflow

diff --git a/tensorflow/cc/gradients/array_grad_test.cc b/tensorflow/cc/gradients/array_grad_test.cc
index 4a215fcc929..2a2180297ce 100644
--- a/tensorflow/cc/gradients/array_grad_test.cc
+++ b/tensorflow/cc/gradients/array_grad_test.cc
@@ -354,5 +354,29 @@ TEST_F(ArrayGradTest, MirrorPadGradGrad_Symmetric) {
   RunTest(x, x_shape, y, y_shape);
 }

+TEST_F(ArrayGradTest, StridedSliceGrad) {
+  TensorShape x_shape({6, 4, 4});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
+
+  // y = x[2:6:2, 1:3, 1:3]
+  auto y = StridedSlice(scope_, x, {2, 1, 1}, {6, 3, 3}, {2, 1, 1});
+  // y.shape = [2, 2, 2];
+  RunTest(x, x_shape, y, {2, 2, 2});
+
+  // y = x[2:6:2, 1:3, 1:3]
+  // begin_mask = 1<<1 (ignore begin_index = 1)
+  // end_mask = 1<<2 (ignore end_index = 2)
+  y = StridedSlice(scope_, x, {2, 1, 1}, {6, 3, 3}, {2, 1, 1},
+                   StridedSlice::BeginMask(1<<1).EndMask(1<<2));
+  // y.shape = [2, 3, 3];
+  RunTest(x, x_shape, y, {2, 3, 3});
+
+  // y = [tf.newaxis, 2:6:2, 1:3, 1:3]
+  y = StridedSlice(scope_, x, {0, 2, 1, 1}, {0, 6, 3, 3}, {1, 2, 1, 1},
+                   StridedSlice::NewAxisMask(1<<0));
+  // y.shape = [1, 2, 2, 2];
+  RunTest(x, x_shape, y, {1, 2, 2, 2});
+}
+
 }  // namespace
 }  // namespace tensorflow

From e31fb25f4e3989a846a8e54d789a3bf5efff0cea Mon Sep 17 00:00:00 2001
From: KB Sriram
Date: Thu, 8 Mar 2018 07:40:24 -0800
Subject: [PATCH 0023/1734] Clang-format fixes.

---
 tensorflow/cc/gradients/array_grad_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/cc/gradients/array_grad_test.cc b/tensorflow/cc/gradients/array_grad_test.cc
index 2a2180297ce..de3bd0fc9e2 100644
--- a/tensorflow/cc/gradients/array_grad_test.cc
+++ b/tensorflow/cc/gradients/array_grad_test.cc
@@ -367,13 +367,13 @@ TEST_F(ArrayGradTest, StridedSliceGrad) {
   // begin_mask = 1<<1 (ignore begin_index = 1)
   // end_mask = 1<<2 (ignore end_index = 2)
   y = StridedSlice(scope_, x, {2, 1, 1}, {6, 3, 3}, {2, 1, 1},
-                   StridedSlice::BeginMask(1<<1).EndMask(1<<2));
+                   StridedSlice::BeginMask(1 << 1).EndMask(1 << 2));
   // y.shape = [2, 3, 3];
   RunTest(x, x_shape, y, {2, 3, 3});

   // y = [tf.newaxis, 2:6:2, 1:3, 1:3]
   y = StridedSlice(scope_, x, {0, 2, 1, 1}, {0, 6, 3, 3}, {1, 2, 1, 1},
-                   StridedSlice::NewAxisMask(1<<0));
+                   StridedSlice::NewAxisMask(1 << 0));
   // y.shape = [1, 2, 2, 2];
   RunTest(x, x_shape, y, {1, 2, 2, 2});
 }

From d6533df7cd3ef19b39081a64fcb0bed5f83c7ee0 Mon Sep 17 00:00:00 2001
From: Giuseppe
Date: Thu, 8 Mar 2018 17:49:29 +0100
Subject: [PATCH 0024/1734] Fix markdown error in layers tutorial.

---
 tensorflow/docs_src/tutorials/layers.md | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tensorflow/docs_src/tutorials/layers.md b/tensorflow/docs_src/tutorials/layers.md
index ee03f440c9b..b24d3f4cadc 100644
--- a/tensorflow/docs_src/tutorials/layers.md
+++ b/tensorflow/docs_src/tutorials/layers.md
@@ -192,8 +192,7 @@ dive deeper into the `tf.layers` code used to create each layer, as well as how
 to calculate loss, configure the training op, and generate predictions. If
 you're already experienced with CNNs and @{$get_started/custom_estimators$TensorFlow `Estimator`s},
 and find the above code intuitive, you may want to skim these sections or just
-skip ahead to ["Training and Evaluating the CNN MNIST
-Classifier"](#training-and-evaluating-the-cnn-mnist-classifier).
+skip ahead to ["Training and Evaluating the CNN MNIST Classifier"](#training_and_evaluating_the_cnn_mnist_classifier).

 ### Input Layer

@@ -534,9 +533,8 @@ if mode == tf.estimator.ModeKeys.TRAIN:
 ```

 > Note: For a more in-depth look at configuring training ops for Estimator model
-> functions, see @{$get_started/custom_estimators#defining-the-training-op-for-the-model$"Defining
-> the training op for the model"} in the @{$get_started/custom_estimators$"Creating Estimations in
-> tf.estimator"} tutorial.
+> functions, see @{$get_started/custom_estimators#defining-the-training-op-for-the-model$"Defining the training op for the model"}
+> in the @{$get_started/custom_estimators$"Creating Estimations in tf.estimator"} tutorial.

 ### Add evaluation metrics

From fe46c22a80b068b2b30f1e44f2f950ba6b6e907b Mon Sep 17 00:00:00 2001
From: Joe Yearsley
Date: Fri, 9 Mar 2018 22:41:37 +0000
Subject: [PATCH 0025/1734] Update fold_old_batch_norms.cc

Fixes a problem with using fused batch normalization together with this
transform; it only shows up when using 'NCHW', as the default is 'NHWC'.
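
As an aside for readers, here is a minimal sketch (not part of the patch; the
tensor shapes are made up) of why the folded BiasAdd has to carry the
convolution's data_format. With NCHW data the channel axis is axis 1, so a
BiasAdd left at its NHWC default would broadcast the bias along the wrong axis:

```python
# Illustrative only: bias_add must know the layout of its input.
import numpy as np
import tensorflow as tf

x = tf.constant(np.random.randn(1, 8, 5, 5), dtype=tf.float32)  # NCHW, 8 channels
bias = tf.constant(np.random.randn(8), dtype=tf.float32)

y = tf.nn.bias_add(x, bias, data_format="NCHW")  # adds the bias along axis 1
# tf.nn.bias_add(x, bias) would assume NHWC and try to add the 8-element bias
# along the last axis (size 5 here), which fails at graph-construction time.

with tf.Session() as sess:
  print(sess.run(y).shape)  # (1, 8, 5, 5)
```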
---
 tensorflow/tools/graph_transforms/fold_old_batch_norms.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
index d86f65325be..a5acd53ad62 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
@@ -159,6 +159,7 @@ Status FuseScaleOffsetToConvWeights(const std::vector<float>& scale_values,
   NodeDef bias_add_node;
   bias_add_node.set_op("BiasAdd");
   bias_add_node.set_name(conv_output_name);
+  bias_add_op.attr["data_format"].CopyFrom(conv_node.attr["data_format"])
   CopyNodeAttr(conv_node, "T", "T", &bias_add_node);
   AddNodeInput(conv_node.name(), &bias_add_node);
   AddNodeInput(bias_offset_node.name(), &bias_add_node);

From 1ad788b136d509888cf7d484f762e31b2ee37a50 Mon Sep 17 00:00:00 2001
From: Joe Yearsley
Date: Fri, 9 Mar 2018 22:46:30 +0000
Subject: [PATCH 0026/1734] Update fold_old_batch_norms.cc

---
 tensorflow/tools/graph_transforms/fold_old_batch_norms.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
index a5acd53ad62..3376a813120 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
@@ -159,7 +159,7 @@ Status FuseScaleOffsetToConvWeights(const std::vector<float>& scale_values,
   NodeDef bias_add_node;
   bias_add_node.set_op("BiasAdd");
   bias_add_node.set_name(conv_output_name);
-  bias_add_op.attr["data_format"].CopyFrom(conv_node.attr["data_format"])
+  bias_add_node.attr["data_format"].CopyFrom(conv_node.attr["data_format"])
   CopyNodeAttr(conv_node, "T", "T", &bias_add_node);
   AddNodeInput(conv_node.name(), &bias_add_node);
   AddNodeInput(bias_offset_node.name(), &bias_add_node);

From d0680917907671f5870818d21ee0ff77bf7c3ff6 Mon Sep 17 00:00:00 2001
From: Joe Yearsley
Date: Fri, 9 Mar 2018 23:56:52 +0000
Subject: [PATCH 0027/1734] Update fold_old_batch_norms.cc

---
 tensorflow/tools/graph_transforms/fold_old_batch_norms.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
index 3376a813120..59f3ffdcda4 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
@@ -159,7 +159,7 @@ Status FuseScaleOffsetToConvWeights(const std::vector<float>& scale_values,
   NodeDef bias_add_node;
   bias_add_node.set_op("BiasAdd");
   bias_add_node.set_name(conv_output_name);
-  bias_add_node.attr["data_format"].CopyFrom(conv_node.attr["data_format"])
+  CopyNodeAttr(conv_node, "data_format", "data_format", &bias_add_node);
   CopyNodeAttr(conv_node, "T", "T", &bias_add_node);
   AddNodeInput(conv_node.name(), &bias_add_node);
   AddNodeInput(bias_offset_node.name(), &bias_add_node);

From b4db970c338123ee3156bb0e216193bde35d4b17 Mon Sep 17 00:00:00 2001
From: imsheridan
Date: Tue, 13 Mar 2018 00:04:33 +0800
Subject: [PATCH 0028/1734] fix broken link of tensor-like type

---
 tensorflow/docs_src/programmers_guide/graphs.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md
index f28660d44a9..81fd99cb4a4 100644
--- a/tensorflow/docs_src/programmers_guide/graphs.md
+++ b/tensorflow/docs_src/programmers_guide/graphs.md
@@ -362,7 +362,7
@@ operations that are needed to compute the result. @{tf.Session.run} requires you to specify a list of **fetches**, which determine the return values, and may be a @{tf.Operation}, a @{tf.Tensor}, or -a [tensor-like type](#tensor-like-objects) such as @{tf.Variable}. These fetches +a [tensor-like type](#tensor-like_objects) such as @{tf.Variable}. These fetches determine what **subgraph** of the overall @{tf.Graph} must be executed to produce the result: this is the subgraph that contains all operations named in the fetch list, plus all operations whose outputs are used to compute the value From 1f03b013ef00c128cf8331f274524a23d86ac458 Mon Sep 17 00:00:00 2001 From: imsheridan Date: Tue, 13 Mar 2018 16:44:57 +0800 Subject: [PATCH 0029/1734] revert wrong typo fix --- tensorflow/docs_src/programmers_guide/graphs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md index 81fd99cb4a4..69eb6df5f6d 100644 --- a/tensorflow/docs_src/programmers_guide/graphs.md +++ b/tensorflow/docs_src/programmers_guide/graphs.md @@ -508,7 +508,7 @@ is sufficient. However, TensorFlow also provides methods for manipulating the default graph, which can be useful in more advanced use cases. For example: * A @{tf.Graph} defines the namespace for @{tf.Operation} objects: each - operation in a single graph must have an unique name. TensorFlow will + operation in a single graph must have a unique name. TensorFlow will "uniquify" the names of operations by appending `"_1"`, `"_2"`, and so on to their names if the requested name is already taken. Using multiple explicitly created graphs gives you more control over what name is given to each From d751b6bfa84dae1be9835fc40cc3094a8205a74e Mon Sep 17 00:00:00 2001 From: imsheridan Date: Tue, 13 Mar 2018 23:11:47 +0800 Subject: [PATCH 0030/1734] Fix link of typical distributed configuration --- tensorflow/docs_src/programmers_guide/graphs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md index 69eb6df5f6d..e4095cf7dd9 100644 --- a/tensorflow/docs_src/programmers_guide/graphs.md +++ b/tensorflow/docs_src/programmers_guide/graphs.md @@ -210,7 +210,7 @@ with tf.device("/device:GPU:0"): # Operations created in this context will be pinned to the GPU. 
result = tf.matmul(weights, img) ``` -If you are deploying TensorFlow in a typical @{$deploy/distributed} configuration, +If you are deploying TensorFlow in a @{$distributed$typical distributed configuration}, you might specify the job name and task ID to place variables on a task in the parameter server job (`"/job:ps"`), and the other operations on task in the worker job (`"/job:worker"`): From b618740a8754e85a2a6ee142028105f76a4d5d58 Mon Sep 17 00:00:00 2001 From: Wenhao Hu Date: Fri, 16 Mar 2018 00:11:38 +0900 Subject: [PATCH 0031/1734] implement matrix 2-norm --- tensorflow/python/ops/linalg_ops.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py index 37470e00d7f..110b766a6e9 100644 --- a/tensorflow/python/ops/linalg_ops.py +++ b/tensorflow/python/ops/linalg_ops.py @@ -454,7 +454,7 @@ def norm(tensor, This function can compute several different vector norms (the 1-norm, the Euclidean or 2-norm, the inf-norm, and in general the p-norm for p > 0) and - matrix norms (Frobenius, 1-norm, and inf-norm). + matrix norms (Frobenius, 1-norm, 2-norm and inf-norm). Args: tensor: `Tensor` of types `float32`, `float64`, `complex64`, `complex128` @@ -465,7 +465,7 @@ def norm(tensor, Some restrictions apply: a) The Frobenius norm `fro` is not defined for vectors, b) If axis is a 2-tuple (matrix norm), only 'euclidean', 'fro', `1`, - `np.inf` are supported. + `2`, `np.inf` are supported. See the description of `axis` on how to compute norms for a batch of vectors or matrices stored in a tensor. axis: If `axis` is `None` (the default), the input is considered a vector @@ -521,8 +521,7 @@ def norm(tensor, axis[0] == axis[1]): raise ValueError( "'axis' must be None, an integer, or a tuple of 2 unique integers") - # TODO(rmlarsen): Implement matrix 2-norm using tf.svd(). - supported_matrix_norms = ['euclidean', 'fro', 1, np.inf] + supported_matrix_norms = ['euclidean', 'fro', 1, 2, np.inf] if ord not in supported_matrix_norms: raise ValueError("'ord' must be a supported matrix norm in %s, got %s" % (supported_matrix_norms, ord)) @@ -539,10 +538,20 @@ def norm(tensor, with ops.name_scope(name, 'norm', [tensor]): tensor = ops.convert_to_tensor(tensor) + rank = len(tensor.get_shape().as_list()) + axis = tuple(map(lambda i: i if i >= 0 else i + rank, axis)) + if ord in ['fro', 'euclidean', 2, 2.0]: - # TODO(rmlarsen): Move 2-norm to a separate clause once we support it for - # matrices. 
-      result = math_ops.sqrt(
+      if is_matrix_norm and ord in [2, 2.0]:
+        axes = list(range(rank))
+        perm_before = list(filter(lambda i: i not in axis, axes)) + list(axis)
+        perm_after = list(map(lambda i: perm_before.index(i), axes))
+        result = array_ops.transpose(array_ops.expand_dims(math_ops.reduce_max(
+            gen_linalg_ops.svd(array_ops.transpose(tensor, perm=perm_before),
+                               compute_uv=False)[0], axis=-1, keepdims=True),
+            axis=-1), perm=perm_after)
+      else:
+        result = math_ops.sqrt(
         math_ops.reduce_sum(
             tensor * math_ops.conj(tensor), axis, keepdims=True))
     else:
       result = math_ops.abs(tensor)
       if ord == 1:

From a280a1d0cfd64831857826db639a3ee0180094de Mon Sep 17 00:00:00 2001
From: Wenhao Hu
Date: Fri, 16 Mar 2018 00:32:34 +0900
Subject: [PATCH 0032/1734] follow python coding style

---
 tensorflow/python/ops/linalg_ops.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index 110b766a6e9..b467711e3bb 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -546,14 +546,15 @@ def norm(tensor,
       axes = list(range(rank))
       perm_before = list(filter(lambda i: i not in axis, axes)) + list(axis)
       perm_after = list(map(lambda i: perm_before.index(i), axes))
-      result = array_ops.transpose(array_ops.expand_dims(math_ops.reduce_max(
-          gen_linalg_ops.svd(array_ops.transpose(tensor, perm=perm_before),
-                             compute_uv=False)[0], axis=-1, keepdims=True),
-          axis=-1), perm=perm_after)
+      result = array_ops.transpose(array_ops.expand_dims(
+          math_ops.reduce_max(gen_linalg_ops.svd(
+              array_ops.transpose(tensor, perm=perm_before),
+              compute_uv=False)[0], axis=-1, keepdims=True), axis=-1),
+          perm=perm_after)
     else:
       result = math_ops.sqrt(
-        math_ops.reduce_sum(
-            tensor * math_ops.conj(tensor), axis, keepdims=True))
+          math_ops.reduce_sum(
+              tensor * math_ops.conj(tensor), axis, keepdims=True))
   else:
     result = math_ops.abs(tensor)
     if ord == 1:

From cc10ac9b7d593375a7cee0c167c20989dc29e8cf Mon Sep 17 00:00:00 2001
From: Wenhao Hu
Date: Fri, 16 Mar 2018 00:40:05 +0900
Subject: [PATCH 0033/1734] remove unnecessary lambda

---
 tensorflow/python/ops/linalg_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index b467711e3bb..db6ce71125b 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -545,7 +545,7 @@ def norm(tensor,
     if is_matrix_norm and ord in [2, 2.0]:
       axes = list(range(rank))
       perm_before = list(filter(lambda i: i not in axis, axes)) + list(axis)
-      perm_after = list(map(lambda i: perm_before.index(i), axes))
+      perm_after = list(map(perm_before.index, axes))
       result = array_ops.transpose(array_ops.expand_dims(
           math_ops.reduce_max(gen_linalg_ops.svd(
               array_ops.transpose(tensor, perm=perm_before),

From b21ceeb518ca9462a247d8be05870f12bebad201 Mon Sep 17 00:00:00 2001
From: Yong Tang
Date: Thu, 15 Mar 2018 23:13:25 -0700
Subject: [PATCH 0034/1734] Enhancement with deprecated_argument_lookup for argmax

This fix makes some enhancements for argmax, using
deprecated_argument_lookup instead of customized logic.
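
For context, a hedged sketch of the helper being adopted (the real function
lives in `tensorflow.python.util.deprecation` and also emits a deprecation
warning; this standalone version only mirrors the lookup logic relied on here):

```python
def deprecated_argument_lookup(new_name, new_value, old_name, old_value):
  """Prefer the new argument, fall back to the deprecated one, reject both."""
  if old_value is not None:
    if new_value is not None:
      raise ValueError("Cannot specify both '%s' and '%s'"
                       % (new_name, old_name))
    return old_value
  return new_value

# With the helper, argmax's dimension/axis handling collapses to two lines:
axis = deprecated_argument_lookup("axis", None, "dimension", 1)
assert axis == 1
```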
Signed-off-by: Yong Tang
---
 tensorflow/python/ops/math_ops.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index e18d0e95015..9a88b713982 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -208,11 +208,9 @@ def argmax(input,
            name=None,
            dimension=None,
            output_type=dtypes.int64):
-  if dimension is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'axis' and 'dimension'")
-    axis = dimension
-  elif axis is None:
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "dimension", dimension)
+  if axis is None:
     axis = 0
   return gen_math_ops.arg_max(input, axis, name=name, output_type=output_type)

From 82571ca199869f60fe2036d15d0071031d997b47 Mon Sep 17 00:00:00 2001
From: Yong Tang
Date: Thu, 15 Mar 2018 23:15:37 -0700
Subject: [PATCH 0035/1734] Enhancement with deprecated_argument_lookup for argmin

This fix makes some enhancements for argmin, using
deprecated_argument_lookup instead of customized logic.

Signed-off-by: Yong Tang
---
 tensorflow/python/ops/math_ops.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 9a88b713982..a2892d206d1 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -226,11 +226,9 @@ def argmin(input,
            name=None,
            dimension=None,
            output_type=dtypes.int64):
-  if dimension is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'axis' and 'dimension'")
-    axis = dimension
-  elif axis is None:
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "dimension", dimension)
+  if axis is None:
     axis = 0
   return gen_math_ops.arg_min(input, axis, name=name, output_type=output_type)

From 52fef7f6b8b41d4fffa92bddcb78d96eb6333051 Mon Sep 17 00:00:00 2001
From: ManHyuk
Date: Fri, 16 Mar 2018 16:03:26 +0900
Subject: [PATCH 0036/1734] fix typo

---
 tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
index 272410c693a..7651a03fe51 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
@@ -398,7 +398,7 @@ TEST_F(FoldOldBatchNormsTest, TestFoldFusedBatchNorms) {
 }

 TEST_F(FoldOldBatchNormsTest, TestFoldFusedBatchNormsWithConcat) {
-  // Test axis is not 3, so all weigths and offsets are fused to each of inputs
+  // Test axis is not 3, so all weights and offsets are fused to each of inputs
   // of conv2d.
TestFoldFusedBatchNormsWithConcat(/*split=*/true); // Test axis = 3, BatchNorm weights and offsets will be split before fused From 20424e92417b520d7ea8c7323eee46538d2b909f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Sat, 17 Mar 2018 09:30:24 +0800 Subject: [PATCH 0037/1734] CLN: remove the unused import: tf_export --- tensorflow/contrib/opt/python/training/adamax.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py index fe5522a1708..65918831e92 100644 --- a/tensorflow/contrib/opt/python/training/adamax.py +++ b/tensorflow/contrib/opt/python/training/adamax.py @@ -26,7 +26,6 @@ from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import state_ops from tensorflow.python.training import adam from tensorflow.python.training import training_ops -from tensorflow.python.util.tf_export import tf_export class AdaMaxOptimizer(adam.AdamOptimizer): From b5ebb7e9e5f5ae59e6db93bb5950f4bb68bf9e18 Mon Sep 17 00:00:00 2001 From: Wenhao Hu Date: Sun, 18 Mar 2018 00:48:46 +0900 Subject: [PATCH 0038/1734] update norm_op_test --- tensorflow/python/kernel_tests/norm_op_test.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensorflow/python/kernel_tests/norm_op_test.py b/tensorflow/python/kernel_tests/norm_op_test.py index d85512fae69..d6625b69ef7 100644 --- a/tensorflow/python/kernel_tests/norm_op_test.py +++ b/tensorflow/python/kernel_tests/norm_op_test.py @@ -85,8 +85,6 @@ def _GetNormOpTest(dtype_, shape_, ord_, axis_, keep_dims_, use_static_shape_): if ((not is_matrix_norm and ord_ == "fro") or (is_matrix_norm and is_fancy_p_norm)): self.skipTest("Not supported by neither numpy.linalg.norm nor tf.norm") - if is_matrix_norm and ord_ == 2: - self.skipTest("Not supported by tf.norm") if ord_ == 'euclidean' or (axis_ is None and len(shape) > 2): self.skipTest("Not supported by numpy.linalg.norm") matrix = np.random.randn(*shape_).astype(dtype_) From c53160a2a5decdae30bda6e8f40b45f3b4dd9f8e Mon Sep 17 00:00:00 2001 From: Wenhao Hu Date: Sun, 18 Mar 2018 00:49:13 +0900 Subject: [PATCH 0039/1734] use tf function instead of np --- tensorflow/python/ops/linalg_ops.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py index db6ce71125b..d8150d85b93 100644 --- a/tensorflow/python/ops/linalg_ops.py +++ b/tensorflow/python/ops/linalg_ops.py @@ -24,6 +24,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import functional_ops from tensorflow.python.ops import gen_linalg_ops from tensorflow.python.ops import math_ops # pylint: disable=wildcard-import @@ -538,19 +539,27 @@ def norm(tensor, with ops.name_scope(name, 'norm', [tensor]): tensor = ops.convert_to_tensor(tensor) - rank = len(tensor.get_shape().as_list()) - axis = tuple(map(lambda i: i if i >= 0 else i + rank, axis)) if ord in ['fro', 'euclidean', 2, 2.0]: if is_matrix_norm and ord in [2, 2.0]: - axes = list(range(rank)) - perm_before = list(filter(lambda i: i not in axis, axes)) + list(axis) - perm_after = list(map(perm_before.index, axes)) - result = array_ops.transpose(array_ops.expand_dims( - math_ops.reduce_max(gen_linalg_ops.svd( - array_ops.transpose(tensor, perm=perm_before), - 
compute_uv=False)[0], axis=-1, keepdims=True), axis=-1), - perm=perm_after) + rank = array_ops.rank(tensor) + axis = functional_ops.map_fn( + lambda i: control_flow_ops.cond(i >= 0, lambda: i, + lambda: i + rank), + ops.convert_to_tensor(axis)).eval() + axes = math_ops.range(rank) + perm_before = array_ops.concat( + [array_ops.setdiff1d(axes, axis)[0], axis], axis=0) + perm_after = functional_ops.map_fn( + lambda i: math_ops.cast( + array_ops.squeeze( + array_ops.where(math_ops.equal(perm_before, i))), + dtype=dtypes.int32), axes) + permed = array_ops.transpose(tensor, perm=perm_before) + matrix_2_norm = array_ops.expand_dims( + math_ops.reduce_max(gen_linalg_ops.svd(permed, compute_uv=False)[0], + axis=-1, keepdims=True), axis=-1) + result = array_ops.transpose(matrix_2_norm, perm=perm_after) else: result = math_ops.sqrt( math_ops.reduce_sum( From fda633fb7187da8522ef79555d1267996fa983bc Mon Sep 17 00:00:00 2001 From: Wenhao Hu Date: Sun, 18 Mar 2018 21:29:16 +0900 Subject: [PATCH 0040/1734] remove test code --- tensorflow/python/ops/linalg_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py index d8150d85b93..608b72c574a 100644 --- a/tensorflow/python/ops/linalg_ops.py +++ b/tensorflow/python/ops/linalg_ops.py @@ -546,7 +546,7 @@ def norm(tensor, axis = functional_ops.map_fn( lambda i: control_flow_ops.cond(i >= 0, lambda: i, lambda: i + rank), - ops.convert_to_tensor(axis)).eval() + ops.convert_to_tensor(axis)) axes = math_ops.range(rank) perm_before = array_ops.concat( [array_ops.setdiff1d(axes, axis)[0], axis], axis=0) From 1da3a47287aa911287d6667dd837dc2a7ddaa8f1 Mon Sep 17 00:00:00 2001 From: Smit Shilu Date: Thu, 22 Mar 2018 10:58:51 -0400 Subject: [PATCH 0041/1734] Update BUILD exports_files(["LICENSE"]) gives error while building on Mac and Ubuntu --- tensorflow/contrib/lite/BUILD | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD index dafe6f136ef..1c5bc29763d 100644 --- a/tensorflow/contrib/lite/BUILD +++ b/tensorflow/contrib/lite/BUILD @@ -6,8 +6,6 @@ licenses(["notice"]) # Apache 2.0 load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts", "gen_selected_ops") -exports_files(["LICENSE"]) - exports_files(glob([ "testdata/*.bin", "testdata/*.pb", From 07502453382cc007f42818118a592220a8c7d849 Mon Sep 17 00:00:00 2001 From: "wenhao.hu" Date: Wed, 28 Mar 2018 10:25:47 +0900 Subject: [PATCH 0042/1734] clean the pollution of axis --- tensorflow/python/ops/linalg_ops.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py index 608b72c574a..86be1e7752d 100644 --- a/tensorflow/python/ops/linalg_ops.py +++ b/tensorflow/python/ops/linalg_ops.py @@ -543,13 +543,12 @@ def norm(tensor, if ord in ['fro', 'euclidean', 2, 2.0]: if is_matrix_norm and ord in [2, 2.0]: rank = array_ops.rank(tensor) - axis = functional_ops.map_fn( - lambda i: control_flow_ops.cond(i >= 0, lambda: i, - lambda: i + rank), + positive_axis = functional_ops.map_fn( + lambda i: control_flow_ops.cond(i >= 0, lambda: i, lambda: i + rank), ops.convert_to_tensor(axis)) axes = math_ops.range(rank) perm_before = array_ops.concat( - [array_ops.setdiff1d(axes, axis)[0], axis], axis=0) + [array_ops.setdiff1d(axes, positive_axis)[0], positive_axis], axis=0) perm_after = functional_ops.map_fn( lambda i: math_ops.cast( array_ops.squeeze( @@ -557,8 +556,11 @@ def 
norm(tensor, dtype=dtypes.int32), axes) permed = array_ops.transpose(tensor, perm=perm_before) matrix_2_norm = array_ops.expand_dims( - math_ops.reduce_max(gen_linalg_ops.svd(permed, compute_uv=False)[0], - axis=-1, keepdims=True), axis=-1) + math_ops.reduce_max( + gen_linalg_ops.svd(permed, compute_uv=False)[0], + axis=-1, + keepdims=True), + axis=-1) result = array_ops.transpose(matrix_2_norm, perm=perm_after) else: result = math_ops.sqrt( From e9ea69058974d9155851c6325362dc3cb188cefb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Wed, 28 Mar 2018 10:22:31 +0800 Subject: [PATCH 0043/1734] CLN: remove no_oss, notsan tags --- tensorflow/contrib/opt/BUILD | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD index a86d150f7a0..aaf00128081 100644 --- a/tensorflow/contrib/opt/BUILD +++ b/tensorflow/contrib/opt/BUILD @@ -53,10 +53,6 @@ py_test( name = "adamax_test", srcs = ["python/training/adamax_test.py"], srcs_version = "PY2AND3", - tags = [ - "no_oss", # b/73507407 - "notsan", # b/31055119 - ], deps = [ ":opt_py", "//tensorflow/python:array_ops", From 3a9d5e51bbb7f205a74cbfe5e6bae953d4fc2149 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Wed, 28 Mar 2018 10:28:21 +0800 Subject: [PATCH 0044/1734] CLN: add comment for variable --- tensorflow/contrib/opt/python/training/adamax.py | 2 +- tensorflow/core/kernels/training_ops.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py index 65918831e92..403fdaa637b 100644 --- a/tensorflow/contrib/opt/python/training/adamax.py +++ b/tensorflow/contrib/opt/python/training/adamax.py @@ -48,7 +48,7 @@ class AdaMaxOptimizer(adam.AdamOptimizer): ``` The update rule for `variable` with gradient `g` uses an optimization - described at the end of section7.1 of the paper: + described at the end of section 7.1 of the paper: ``` t <- t + 1 diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc index 0387e3011ea..45c600fd40a 100644 --- a/tensorflow/core/kernels/training_ops.cc +++ b/tensorflow/core/kernels/training_ops.cc @@ -343,9 +343,9 @@ struct ApplyAdaMaxNonCuda { LOG(WARNING) << "AdaMax doesn't support use_nesterov yet, ignore it."; } m.device(d) += (grad - m) * (T(1) - beta1()); - // v == u + // Here v is u in section 7.1 v.device(d) = (beta2() * v).cwiseMax(grad.abs()); - // var == θ + // var is θ in section 7.1 var.device(d) -= lr() / (T(1) - beta1_power()) * (m / (v + epsilon())); } }; From c15dbc39505de93770fd89cab4f4ae9a2a72b4e1 Mon Sep 17 00:00:00 2001 From: Wenhao Hu Date: Thu, 29 Mar 2018 02:33:24 +0900 Subject: [PATCH 0045/1734] fix test --- tensorflow/python/kernel_tests/norm_op_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/kernel_tests/norm_op_test.py b/tensorflow/python/kernel_tests/norm_op_test.py index d6625b69ef7..0e7d4fd9b98 100644 --- a/tensorflow/python/kernel_tests/norm_op_test.py +++ b/tensorflow/python/kernel_tests/norm_op_test.py @@ -37,17 +37,17 @@ class NormOpTest(test_lib.TestCase): def testBadOrder(self): matrix = [[0., 1.], [2., 3.]] - for ord_ in "foo", -7, -1.1, 0: + for ord_ in "fro", -7, -1.1, 0: with self.assertRaisesRegexp(ValueError, "'ord' must be a supported vector norm"): - linalg_ops.norm(matrix, ord="fro") + linalg_ops.norm(matrix, ord=ord_) - for ord_ 
in "foo", -7, -1.1, 0: + for ord_ in "fro", -7, -1.1, 0: with self.assertRaisesRegexp(ValueError, "'ord' must be a supported vector norm"): linalg_ops.norm(matrix, ord=ord_, axis=-1) - for ord_ in 1.1, 2: + for ord_ in "foo", -7, -1.1, 1.1: with self.assertRaisesRegexp(ValueError, "'ord' must be a supported matrix norm"): linalg_ops.norm(matrix, ord=ord_, axis=[-2, -1]) From ab4efde7162445f20c73bdd3419811ab9c324a24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Thu, 29 Mar 2018 06:48:19 +0800 Subject: [PATCH 0046/1734] DOC: explain difference between adamax and adam --- tensorflow/contrib/opt/python/training/adamax.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py index 403fdaa637b..ea08a0931b2 100644 --- a/tensorflow/contrib/opt/python/training/adamax.py +++ b/tensorflow/contrib/opt/python/training/adamax.py @@ -31,7 +31,8 @@ from tensorflow.python.training import training_ops class AdaMaxOptimizer(adam.AdamOptimizer): """Optimizer that implements the AdaMax algorithm. - See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980) + Adamax is sometimes superior to adam, specially in models with embeddings, + see [Kingma et al., 2014](http://arxiv.org/abs/1412.6980) ([pdf](http://arxiv.org/pdf/1412.6980.pdf)). """ From ab3b1705bc2c546eb3607876fcdcc45902552346 Mon Sep 17 00:00:00 2001 From: Wenhao Hu Date: Sat, 31 Mar 2018 00:36:25 +0900 Subject: [PATCH 0047/1734] cast svd output to float32 and use keepdims in test cases --- tensorflow/python/kernel_tests/norm_op_test.py | 4 ++-- tensorflow/python/ops/linalg_ops.py | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/kernel_tests/norm_op_test.py b/tensorflow/python/kernel_tests/norm_op_test.py index 0e7d4fd9b98..dde28007d46 100644 --- a/tensorflow/python/kernel_tests/norm_op_test.py +++ b/tensorflow/python/kernel_tests/norm_op_test.py @@ -69,12 +69,12 @@ def _GetNormOpTest(dtype_, shape_, ord_, axis_, keep_dims_, use_static_shape_): if use_static_shape_: tf_matrix = constant_op.constant(matrix) tf_norm = linalg_ops.norm( - tf_matrix, ord=ord_, axis=axis_, keep_dims=keep_dims_) + tf_matrix, ord=ord_, axis=axis_, keepdims=keep_dims_) tf_norm_val = sess.run(tf_norm) else: tf_matrix = array_ops.placeholder(dtype_) tf_norm = linalg_ops.norm( - tf_matrix, ord=ord_, axis=axis_, keep_dims=keep_dims_) + tf_matrix, ord=ord_, axis=axis_, keepdims=keep_dims_) tf_norm_val = sess.run(tf_norm, feed_dict={tf_matrix: matrix}) self.assertAllClose(np_norm, tf_norm_val) diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py index 86be1e7752d..bbc39f58db5 100644 --- a/tensorflow/python/ops/linalg_ops.py +++ b/tensorflow/python/ops/linalg_ops.py @@ -548,7 +548,8 @@ def norm(tensor, ops.convert_to_tensor(axis)) axes = math_ops.range(rank) perm_before = array_ops.concat( - [array_ops.setdiff1d(axes, positive_axis)[0], positive_axis], axis=0) + [array_ops.setdiff1d(axes, positive_axis)[0], positive_axis], + axis=0) perm_after = functional_ops.map_fn( lambda i: math_ops.cast( array_ops.squeeze( @@ -557,7 +558,9 @@ def norm(tensor, permed = array_ops.transpose(tensor, perm=perm_before) matrix_2_norm = array_ops.expand_dims( math_ops.reduce_max( - gen_linalg_ops.svd(permed, compute_uv=False)[0], + math_ops.cast( + gen_linalg_ops.svd(permed, compute_uv=False)[0], + dtype=dtypes.float32), axis=-1, keepdims=True), axis=-1) From 
6b1d9e788305c41cf436a1873c59df8d0df87d44 Mon Sep 17 00:00:00 2001 From: Wenhao Hu Date: Sat, 31 Mar 2018 01:27:05 +0900 Subject: [PATCH 0048/1734] use abs instead of cast --- tensorflow/python/ops/linalg_ops.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py index bbc39f58db5..b306042aff6 100644 --- a/tensorflow/python/ops/linalg_ops.py +++ b/tensorflow/python/ops/linalg_ops.py @@ -558,9 +558,7 @@ def norm(tensor, permed = array_ops.transpose(tensor, perm=perm_before) matrix_2_norm = array_ops.expand_dims( math_ops.reduce_max( - math_ops.cast( - gen_linalg_ops.svd(permed, compute_uv=False)[0], - dtype=dtypes.float32), + math_ops.abs(gen_linalg_ops.svd(permed, compute_uv=False)[0]), axis=-1, keepdims=True), axis=-1) From 0c6845db28bd690eb848dde837f23fef6a0a8eed Mon Sep 17 00:00:00 2001 From: josephyearsley Date: Sat, 31 Mar 2018 17:40:40 +0100 Subject: [PATCH 0049/1734] Copy data_format if the original node has that attr. --- tensorflow/tools/graph_transforms/fold_old_batch_norms.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc index 59f3ffdcda4..988ba25e366 100644 --- a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc +++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc @@ -159,7 +159,9 @@ Status FuseScaleOffsetToConvWeights(const std::vector& scale_values, NodeDef bias_add_node; bias_add_node.set_op("BiasAdd"); bias_add_node.set_name(conv_output_name); - CopyNodeAttr(conv_node, "data_format", "data_format", &bias_add_node); + if (HasAttr(conv_node, "data_format")) { + CopyNodeAttr(conv_node, "data_format", "data_format", &bias_add_node); + } CopyNodeAttr(conv_node, "T", "T", &bias_add_node); AddNodeInput(conv_node.name(), &bias_add_node); AddNodeInput(bias_offset_node.name(), &bias_add_node); From 3bf08422a2cdd732e9b00debe3d217d04473902d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Sun, 1 Apr 2018 09:56:48 +0800 Subject: [PATCH 0050/1734] CLN: remove use_nesterov argument --- .../base_api/api_def_ApplyAdaMax.pbtxt | 6 - .../api_def_ResourceApplyAdaMax.pbtxt | 6 - tensorflow/core/kernels/training_ops.cc | 204 +++++++++++------- tensorflow/core/kernels/training_ops.h | 2 +- .../core/kernels/training_ops_gpu.cu.cc | 2 +- tensorflow/core/ops/training_ops.cc | 2 - 6 files changed, 133 insertions(+), 89 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt index 106c30ca83a..57938b42ae5 100644 --- a/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt @@ -72,12 +72,6 @@ END If `True`, updating of the var, m, and v tensors will be protected by a lock; otherwise the behavior is undefined, but may exhibit less contention. 
-END - } - attr { - name: "use_nesterov" - description: <::ConstScalar beta1, typename TTypes::ConstScalar beta2, typename TTypes::ConstScalar epsilon, - typename TTypes::ConstFlat grad, bool use_nesterov) { - if (use_nesterov) { - LOG(WARNING) << "AdaMax doesn't support use_nesterov yet, ignore it."; - } + typename TTypes::ConstFlat grad) { m.device(d) += (grad - m) * (T(1) - beta1()); // Here v is u in section 7.1 v.device(d) = (beta2() * v).cwiseMax(grad.abs()); @@ -350,20 +347,6 @@ struct ApplyAdaMaxNonCuda { } }; -#ifdef TENSORFLOW_USE_SYCL -template -struct ApplyAdaMaxSYCL { - void operator()(const SYCLDevice& d, typename TTypes::Flat var, - typename TTypes::Flat m, typename TTypes::Flat v, - T beta1_power, T beta2_power, T lr, T beta1, T beta2, - T epsilon, typename TTypes::ConstFlat grad) { - m.device(d) += (grad - m) * (T(1) - beta1); - v.device(d) = (beta2 * v).cwiseMax(grad.abs()); - var.device(d) -= lr / (T(1) - beta1_power) * (m / (v + epsilon)); - } -}; -#endif // TENSORFLOW_USE_SYCL - template struct ApplyAdaMax : ApplyAdaMaxNonCuda {}; @@ -2516,12 +2499,10 @@ TF_CALL_double(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS -template - class Functor> -class ApplyAdamBaseOp : public OpKernel { +template +class ApplyAdamOp : public OpKernel { public: - explicit ApplyAdamBaseOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + explicit ApplyAdamOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_)); } @@ -2594,11 +2575,11 @@ class ApplyAdamBaseOp : public OpKernel { grad.shape().DebugString())); const Device& device = ctx->template eigen_device(); - auto functor = Functor(); - functor(device, var.flat(), m.flat(), v.flat(), - beta1_power.scalar(), beta2_power.scalar(), lr.scalar(), - beta1.scalar(), beta2.scalar(), epsilon.scalar(), - grad.flat(), use_nesterov_); + functor::ApplyAdam()( + device, var.flat(), m.flat(), v.flat(), + beta1_power.scalar(), beta2_power.scalar(), lr.scalar(), + beta1.scalar(), beta2.scalar(), epsilon.scalar(), + grad.flat(), use_nesterov_); MaybeForwardRefInputToRefOutput(ctx, 0, 0); } @@ -2609,11 +2590,10 @@ class ApplyAdamBaseOp : public OpKernel { }; #ifdef TENSORFLOW_USE_SYCL -template class Functor> -class ApplyAdamBaseOp : public OpKernel { +template +class ApplyAdamOp : public OpKernel { public: - explicit ApplyAdamBaseOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + explicit ApplyAdamOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); } @@ -2714,10 +2694,9 @@ class ApplyAdamBaseOp : public OpKernel { var.shape().DebugString(), " ", grad.shape().DebugString())); - auto functor = Functor(); - functor(device, var.flat(), m.flat(), v.flat(), - beta1_power, beta2_power, lr, beta1, beta2, - epsilon, grad.flat()); + functor::ApplyAdamSYCL()(device, var.flat(), m.flat(), v.flat(), + beta1_power, beta2_power, lr, beta1, beta2, + epsilon, grad.flat()); MaybeForwardRefInputToRefOutput(ctx, 0, 0); } @@ -2727,28 +2706,28 @@ class ApplyAdamBaseOp : public OpKernel { }; #endif // TENSORFLOW_USE_SYCL -#define REGISTER_KERNELS(D, T, F) \ +#define REGISTER_KERNELS(D, T) \ REGISTER_KERNEL_BUILDER( \ Name("ApplyAdam").Device(DEVICE_##D).TypeConstraint("T"), \ - ApplyAdamBaseOp); \ + ApplyAdamOp); \ REGISTER_KERNEL_BUILDER(Name("ResourceApplyAdam") \ .HostMemory("var") \ .HostMemory("m") \ .HostMemory("v") \ .Device(DEVICE_##D) \ 
.TypeConstraint("T"), \ - ApplyAdamBaseOp); -#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T, functor::ApplyAdam); + ApplyAdamOp); +#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); + TF_CALL_half(REGISTER_CPU_KERNELS); TF_CALL_float(REGISTER_CPU_KERNELS); TF_CALL_double(REGISTER_CPU_KERNELS); -#undef REGISTER_CPU_KERNELS #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T, functor::ApplyAdamSYCL); +#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T); + TF_CALL_float(REGISTER_SYCL_KERNELS); TF_CALL_double(REGISTER_SYCL_KERNELS); -#undef REGISTER_SYCL_KERNELS #endif #if GOOGLE_CUDA @@ -2773,44 +2752,124 @@ DECLARE_GPU_SPEC(double); #undef DECLARE_GPU_SPEC } // namespace functor -#define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T, functor::ApplyAdam); -REGISTER_GPU_KERNELS(Eigen::half); -REGISTER_GPU_KERNELS(float); -REGISTER_GPU_KERNELS(double); -#undef REGISTER_GPU_KERNELS +REGISTER_KERNELS(GPU, Eigen::half); +REGISTER_KERNELS(GPU, float); +REGISTER_KERNELS(GPU, double); #endif +#undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS -#define REGISTER_KERNELS(D, T, F) \ - REGISTER_KERNEL_BUILDER( \ +template +class ApplyAdaMaxOp : public OpKernel { + public: + explicit ApplyAdaMaxOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* ctx) override { + auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, + {0, 1, 2}); + + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( + ctx, 0, use_exclusive_lock_, false, &var)); + Tensor m; + OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( + ctx, 1, use_exclusive_lock_, false, &m)); + Tensor v; + OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( + ctx, 2, use_exclusive_lock_, false, &v)); + OP_REQUIRES( + ctx, var.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", requested_input(0))); + OP_REQUIRES( + ctx, m.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", requested_input(1))); + OP_REQUIRES( + ctx, v.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", requested_input(2))); + + const Tensor& beta1_power = ctx->input(3); + const Tensor& beta2_power = ctx->input(4); + const Tensor& lr = ctx->input(5); + const Tensor& beta1 = ctx->input(6); + const Tensor& beta2 = ctx->input(7); + const Tensor& epsilon = ctx->input(8); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power.shape()), + errors::InvalidArgument("beta1_power is not a scalar: ", + beta1_power.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_power.shape()), + errors::InvalidArgument("beta2_power is not a scalar: ", + beta2_power.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar : ", + lr.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()), + errors::InvalidArgument("beta1 is not a scalar: ", + beta1.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2.shape()), + errors::InvalidArgument("beta2 is not a scalar: ", + beta2.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()), + errors::InvalidArgument("epsilon is not a scalar: ", + epsilon.shape().DebugString())); + + const Tensor& grad = ctx->input(9); + OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()), + 
errors::InvalidArgument("var and m do not have the same shape", + var.shape().DebugString(), " ", + m.shape().DebugString())); + OP_REQUIRES(ctx, var.shape().IsSameSize(v.shape()), + errors::InvalidArgument("var and v do not have the same shape", + var.shape().DebugString(), " ", + v.shape().DebugString())); + OP_REQUIRES( + ctx, var.shape().IsSameSize(grad.shape()), + errors::InvalidArgument("var and grad do not have the same shape", + var.shape().DebugString(), " ", + grad.shape().DebugString())); + + const Device& device = ctx->template eigen_device(); + functor::ApplyAdaMax()( + device, var.flat(), m.flat(), v.flat(), + beta1_power.scalar(), beta2_power.scalar(), lr.scalar(), + beta1.scalar(), beta2.scalar(), epsilon.scalar(), + grad.flat()); + + MaybeForwardRefInputToRefOutput(ctx, 0, 0); + } + + private: + bool use_exclusive_lock_; +}; + +#define REGISTER_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER( \ Name("ApplyAdaMax").Device(DEVICE_##D).TypeConstraint("T"), \ - ApplyAdamBaseOp); \ + ApplyAdaMaxOp); \ REGISTER_KERNEL_BUILDER(Name("ResourceApplyAdaMax") \ - .HostMemory("var") \ - .HostMemory("m") \ - .HostMemory("v") \ - .Device(DEVICE_##D) \ - .TypeConstraint("T"), \ - ApplyAdamBaseOp); -#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T, functor::ApplyAdaMax); + .HostMemory("var") \ + .HostMemory("m") \ + .HostMemory("v") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T"), \ + ApplyAdaMaxOp); +#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); + TF_CALL_half(REGISTER_CPU_KERNELS); TF_CALL_float(REGISTER_CPU_KERNELS); TF_CALL_double(REGISTER_CPU_KERNELS); -#undef REGISTER_CPU_KERNELS - -#ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T, functor::ApplyAdaMaxSYCL); -TF_CALL_float(REGISTER_SYCL_KERNELS); -TF_CALL_double(REGISTER_SYCL_KERNELS); -#undef REGISTER_SYCL_KERNELS -#endif #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. 
namespace functor { #define DECLARE_GPU_SPEC(T) \ template <> \ - void ApplyAdaMax::operator()( \ + void ApplyAdaMax::operator()( \ const GPUDevice& d, typename TTypes::Flat var, \ typename TTypes::Flat m, typename TTypes::Flat v, \ typename TTypes::ConstScalar beta1_power, \ @@ -2819,7 +2878,7 @@ namespace functor { typename TTypes::ConstScalar beta1, \ typename TTypes::ConstScalar beta2, \ typename TTypes::ConstScalar epsilon, \ - typename TTypes::ConstFlat grad, bool use_nesterov); \ + typename TTypes::ConstFlat grad); \ extern template struct ApplyAdaMax; DECLARE_GPU_SPEC(Eigen::half); DECLARE_GPU_SPEC(float); @@ -2827,12 +2886,11 @@ DECLARE_GPU_SPEC(double); #undef DECLARE_GPU_SPEC } // namespace functor -#define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T, functor::ApplyAdaMax); -REGISTER_GPU_KERNELS(Eigen::half); -REGISTER_GPU_KERNELS(float); -REGISTER_GPU_KERNELS(double); -#undef REGISTER_GPU_KERNELS +REGISTER_KERNELS(GPU, Eigen::half); +REGISTER_KERNELS(GPU, float); +REGISTER_KERNELS(GPU, double); #endif +#undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS template diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h index 46a52902108..74acc12d502 100644 --- a/tensorflow/core/kernels/training_ops.h +++ b/tensorflow/core/kernels/training_ops.h @@ -149,7 +149,7 @@ struct ApplyAdaMax { typename TTypes::ConstScalar beta1, typename TTypes::ConstScalar beta2, typename TTypes::ConstScalar epsilon, - typename TTypes::ConstFlat grad, bool use_nesterov); + typename TTypes::ConstFlat grad); }; template diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc index 54c06b130ce..1a6fc264227 100644 --- a/tensorflow/core/kernels/training_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc @@ -152,7 +152,7 @@ struct ApplyAdaMax { typename TTypes::ConstScalar beta1, typename TTypes::ConstScalar beta2, typename TTypes::ConstScalar epsilon, - typename TTypes::ConstFlat grad, bool use_nesterov) { + typename TTypes::ConstFlat grad) { Eigen::array::Tensor::Index, 1> bcast; bcast[0] = grad.dimension(0); Eigen::Sizes<1> single; diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc index 6f107db3eac..99176cec551 100644 --- a/tensorflow/core/ops/training_ops.cc +++ b/tensorflow/core/ops/training_ops.cc @@ -751,7 +751,6 @@ REGISTER_OP("ApplyAdaMax") .Output("out: Ref(T)") .Attr("T: numbertype") .Attr("use_locking: bool = false") - .Attr("use_nesterov: bool = false") .SetShapeFn([](InferenceContext* c) { return ApplyAdamShapeFn(c, false /* sparse */); }); @@ -769,7 +768,6 @@ REGISTER_OP("ResourceApplyAdaMax") .Input("grad: T") .Attr("T: numbertype") .Attr("use_locking: bool = false") - .Attr("use_nesterov: bool = false") .SetShapeFn([](InferenceContext* c) { return ApplyAdamShapeFn(c, false /* sparse */); }); From f4850641530017a3b2b294974298ae13028b8583 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Sun, 1 Apr 2018 10:21:46 +0800 Subject: [PATCH 0051/1734] CLN: code style --- tensorflow/core/kernels/training_ops.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc index 1a8d08288b0..aedca80c317 100644 --- a/tensorflow/core/kernels/training_ops.cc +++ b/tensorflow/core/kernels/training_ops.cc @@ -342,7 +342,7 @@ struct ApplyAdaMaxNonCuda { m.device(d) += (grad - m) * (T(1) - beta1()); // Here v is u in section 7.1 
v.device(d) = (beta2() * v).cwiseMax(grad.abs()); - // var is θ in section 7.1 + // var is θ in section 7.1 var.device(d) -= lr() / (T(1) - beta1_power()) * (m / (v + epsilon())); } }; From 0d343fbb0e8c66622bc21aab39e225c6d895a78b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?= Date: Sun, 1 Apr 2018 10:42:10 +0800 Subject: [PATCH 0052/1734] CLN: remove unused argument beta2_power --- .../contrib/opt/python/training/adamax.py | 42 ++++++++++++++++--- .../opt/python/training/adamax_test.py | 17 +++----- .../base_api/api_def_ApplyAdaMax.pbtxt | 6 --- .../api_def_ResourceApplyAdaMax.pbtxt | 6 --- tensorflow/core/kernels/training_ops.cc | 18 +++----- tensorflow/core/kernels/training_ops.h | 1 - .../core/kernels/training_ops_gpu.cu.cc | 1 - tensorflow/core/ops/training_ops.cc | 24 +++++++++-- 8 files changed, 67 insertions(+), 48 deletions(-) diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py index ea08a0931b2..ba9e79be99b 100644 --- a/tensorflow/contrib/opt/python/training/adamax.py +++ b/tensorflow/contrib/opt/python/training/adamax.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops @@ -85,14 +86,35 @@ class AdaMaxOptimizer(adam.AdamOptimizer): super(AdaMaxOptimizer, self).__init__(learning_rate, beta1, beta2, epsilon, use_locking, name) + def _get_beta_accumulators(self): + if context.in_graph_mode(): + graph = ops.get_default_graph() + else: + graph = None + return self._get_non_slot_variable("beta1_power", graph=graph) + + def _create_slots(self, var_list): + # Create the beta1 accumulators on the same device as the first + # variable. Sort the var_list to make sure this device is consistent across + # workers (these need to go on the same PS, otherwise some updates are + # silently ignored). + first_var = min(var_list, key=lambda x: x.name) + self._create_non_slot_variable(initial_value=self._beta1, + name="beta1_power", + colocate_with=first_var) + + # Create slots for the first and second moments. 
+ for v in var_list: + self._zeros_slot(v, "m", self._name) + self._zeros_slot(v, "v", self._name) + def _apply_dense(self, grad, var): m = self.get_slot(var, "m") v = self.get_slot(var, "v") - beta1_power, beta2_power = self._get_beta_accumulators() + beta1_power = self._get_beta_accumulators() return training_ops.apply_ada_max( var, m, v, math_ops.cast(beta1_power, var.dtype.base_dtype), - math_ops.cast(beta2_power, var.dtype.base_dtype), math_ops.cast(self._lr_t, var.dtype.base_dtype), math_ops.cast(self._beta1_t, var.dtype.base_dtype), math_ops.cast(self._beta2_t, var.dtype.base_dtype), @@ -102,11 +124,10 @@ class AdaMaxOptimizer(adam.AdamOptimizer): def _resource_apply_dense(self, grad, var): m = self.get_slot(var, "m") v = self.get_slot(var, "v") - beta1_power, beta2_power = self._get_beta_accumulators() + beta1_power = self._get_beta_accumulators() return training_ops.resource_apply_ada_max( var.handle, m.handle, v.handle, math_ops.cast(beta1_power, grad.dtype.base_dtype), - math_ops.cast(beta2_power, grad.dtype.base_dtype), math_ops.cast(self._lr_t, grad.dtype.base_dtype), math_ops.cast(self._beta1_t, grad.dtype.base_dtype), math_ops.cast(self._beta2_t, grad.dtype.base_dtype), @@ -115,9 +136,8 @@ class AdaMaxOptimizer(adam.AdamOptimizer): def _apply_sparse_shared(self, grad, var, indices, scatter_add, scatter_update): - beta1_power, beta2_power = self._get_beta_accumulators() + beta1_power = self._get_beta_accumulators() beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype) - beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype) lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) @@ -159,3 +179,13 @@ class AdaMaxOptimizer(adam.AdamOptimizer): return self._apply_sparse_shared( grad, var, indices, self._resource_scatter_add, self._resource_scatter_update) + + def _finish(self, update_ops, name_scope): + # Update the power accumulators. 
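+    # Only beta1_power is tracked: AdaMax's max-based second-moment
+    # estimate needs no initialization-bias correction, so no
+    # beta2_power accumulator is kept.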
+ with ops.control_dependencies(update_ops): + beta1_power = self._get_beta_accumulators() + with ops.colocate_with(beta1_power): + update_beta1 = beta1_power.assign( + beta1_power * self._beta1_t, use_locking=self._use_locking) + return control_flow_ops.group(*update_ops + [update_beta1], + name=name_scope) diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py index e91e5cb96a5..ccd08c09341 100644 --- a/tensorflow/contrib/opt/python/training/adamax_test.py +++ b/tensorflow/contrib/opt/python/training/adamax_test.py @@ -105,12 +105,11 @@ class AdaMaxOptimizerTest(test.TestCase): self.assertAllClose([1.0, 2.0, 3.0], var0.eval()) self.assertAllClose([4.0, 5.0, 6.0], var1.eval()) - beta1_power, beta2_power = opt._get_beta_accumulators() + beta1_power = opt._get_beta_accumulators() # Run 3 steps of AdaMax for t in range(1, 4): self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval()) - self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval()) update.run() var0_np, m0, v0 = adamax_sparse_update_numpy( @@ -195,11 +194,9 @@ class AdaMaxOptimizerTest(test.TestCase): opt = adamax.AdaMaxOptimizer() update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) opt_variables = opt.variables() - beta1_power, beta2_power = opt._get_beta_accumulators() + beta1_power = opt._get_beta_accumulators() self.assertTrue(beta1_power is not None) - self.assertTrue(beta2_power is not None) self.assertIn(beta1_power, opt_variables) - self.assertIn(beta2_power, opt_variables) with ops.Graph().as_default(): # Shouldn't return non-slot variables from other graphs. @@ -211,7 +208,7 @@ class AdaMaxOptimizerTest(test.TestCase): self.assertAllClose([1.0, 2.0], self.evaluate(var0)) self.assertAllClose([3.0, 4.0], self.evaluate(var1)) - beta1_power, beta2_power = opt._get_beta_accumulators() + beta1_power = opt._get_beta_accumulators() # Run 3 steps of AdaMax for t in range(1, 4): @@ -222,8 +219,6 @@ class AdaMaxOptimizerTest(test.TestCase): self.assertAllCloseAccordingToType(0.9**(t + 1), self.evaluate(beta1_power)) - self.assertAllCloseAccordingToType(0.999**(t + 1), - self.evaluate(beta2_power)) var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0) var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1) @@ -265,12 +260,11 @@ class AdaMaxOptimizerTest(test.TestCase): self.assertAllClose([1.0, 2.0], var0.eval()) self.assertAllClose([3.0, 4.0], var1.eval()) - beta1_power, beta2_power = opt._get_beta_accumulators() + beta1_power = opt._get_beta_accumulators() # Run 3 steps of AdaMax for t in range(1, 4): self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval()) - self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval()) update.run() var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0) @@ -299,7 +293,7 @@ class AdaMaxOptimizerTest(test.TestCase): update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() - beta1_power, beta2_power = opt._get_beta_accumulators() + beta1_power = opt._get_beta_accumulators() # Fetch params to validate initial values self.assertAllClose([1.0, 2.0], var0.eval()) @@ -308,7 +302,6 @@ class AdaMaxOptimizerTest(test.TestCase): # Run 3 steps of intertwined AdaMax1 and AdaMax2. 
for t in range(1, 4): self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval()) - self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval()) if t % 2 == 0: update1.run() else: diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt index 57938b42ae5..5e705c009c6 100644 --- a/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt @@ -22,12 +22,6 @@ END name: "beta1_power" description: <::Flat var, typename TTypes::Flat m, typename TTypes::Flat v, typename TTypes::ConstScalar beta1_power, - typename TTypes::ConstScalar beta2_power, typename TTypes::ConstScalar lr, typename TTypes::ConstScalar beta1, typename TTypes::ConstScalar beta2, @@ -2793,18 +2792,14 @@ class ApplyAdaMaxOp : public OpKernel { "Attempting to use uninitialized variables: ", requested_input(2))); const Tensor& beta1_power = ctx->input(3); - const Tensor& beta2_power = ctx->input(4); - const Tensor& lr = ctx->input(5); - const Tensor& beta1 = ctx->input(6); - const Tensor& beta2 = ctx->input(7); - const Tensor& epsilon = ctx->input(8); + const Tensor& lr = ctx->input(4); + const Tensor& beta1 = ctx->input(5); + const Tensor& beta2 = ctx->input(6); + const Tensor& epsilon = ctx->input(7); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power.shape()), errors::InvalidArgument("beta1_power is not a scalar: ", beta1_power.shape().DebugString())); - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_power.shape()), - errors::InvalidArgument("beta2_power is not a scalar: ", - beta2_power.shape().DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), errors::InvalidArgument("lr is not a scalar : ", lr.shape().DebugString())); @@ -2818,7 +2813,7 @@ class ApplyAdaMaxOp : public OpKernel { errors::InvalidArgument("epsilon is not a scalar: ", epsilon.shape().DebugString())); - const Tensor& grad = ctx->input(9); + const Tensor& grad = ctx->input(8); OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()), errors::InvalidArgument("var and m do not have the same shape", var.shape().DebugString(), " ", @@ -2836,7 +2831,7 @@ class ApplyAdaMaxOp : public OpKernel { const Device& device = ctx->template eigen_device(); functor::ApplyAdaMax()( device, var.flat(), m.flat(), v.flat(), - beta1_power.scalar(), beta2_power.scalar(), lr.scalar(), + beta1_power.scalar(), lr.scalar(), beta1.scalar(), beta2.scalar(), epsilon.scalar(), grad.flat()); @@ -2873,7 +2868,6 @@ namespace functor { const GPUDevice& d, typename TTypes::Flat var, \ typename TTypes::Flat m, typename TTypes::Flat v, \ typename TTypes::ConstScalar beta1_power, \ - typename TTypes::ConstScalar beta2_power, \ typename TTypes::ConstScalar lr, \ typename TTypes::ConstScalar beta1, \ typename TTypes::ConstScalar beta2, \ diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h index 74acc12d502..f536a61eb06 100644 --- a/tensorflow/core/kernels/training_ops.h +++ b/tensorflow/core/kernels/training_ops.h @@ -144,7 +144,6 @@ struct ApplyAdaMax { void operator()(const Device& d, typename TTypes::Flat var, typename TTypes::Flat m, typename TTypes::Flat v, typename TTypes::ConstScalar beta1_power, - typename TTypes::ConstScalar beta2_power, typename TTypes::ConstScalar lr, typename TTypes::ConstScalar beta1, typename TTypes::ConstScalar beta2, diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc index 1a6fc264227..2aa17f2a0f3 100644 
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc @@ -147,7 +147,6 @@ struct ApplyAdaMax { void operator()(const GPUDevice& d, typename TTypes::Flat var, typename TTypes::Flat m, typename TTypes::Flat v, typename TTypes::ConstScalar beta1_power, - typename TTypes::ConstScalar beta2_power, typename TTypes::ConstScalar lr, typename TTypes::ConstScalar beta1, typename TTypes::ConstScalar beta2, diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc index 99176cec551..dc7b588898c 100644 --- a/tensorflow/core/ops/training_ops.cc +++ b/tensorflow/core/ops/training_ops.cc @@ -737,12 +737,29 @@ REGISTER_OP("ResourceApplyAdam") return ApplyAdamShapeFn(c, false /* sparse */); }); +static Status ApplyAdaMaxShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // m + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // v + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // beta1_power + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // beta1 + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // beta2 + TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused)); // epsilon + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 8 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return Status::OK(); +} + REGISTER_OP("ApplyAdaMax") .Input("var: Ref(T)") .Input("m: Ref(T)") .Input("v: Ref(T)") .Input("beta1_power: T") - .Input("beta2_power: T") .Input("lr: T") .Input("beta1: T") .Input("beta2: T") @@ -752,7 +769,7 @@ REGISTER_OP("ApplyAdaMax") .Attr("T: numbertype") .Attr("use_locking: bool = false") .SetShapeFn([](InferenceContext* c) { - return ApplyAdamShapeFn(c, false /* sparse */); + return ApplyAdaMaxShapeFn(c, false /* sparse */); }); REGISTER_OP("ResourceApplyAdaMax") @@ -760,7 +777,6 @@ REGISTER_OP("ResourceApplyAdaMax") .Input("m: resource") .Input("v: resource") .Input("beta1_power: T") - .Input("beta2_power: T") .Input("lr: T") .Input("beta1: T") .Input("beta2: T") @@ -769,7 +785,7 @@ REGISTER_OP("ResourceApplyAdaMax") .Attr("T: numbertype") .Attr("use_locking: bool = false") .SetShapeFn([](InferenceContext* c) { - return ApplyAdamShapeFn(c, false /* sparse */); + return ApplyAdaMaxShapeFn(c, false /* sparse */); }); static Status ApplyRMSPropShapeFn(InferenceContext* c, bool sparse) { From 5ca9fedc6b3f9619a3bcf7a5a4a523668055f57d Mon Sep 17 00:00:00 2001 From: imsheridan Date: Mon, 2 Apr 2018 13:02:01 +0800 Subject: [PATCH 0053/1734] Fix adam optimizer related math equation rendering format --- .../opt/python/training/lazy_adam_optimizer.py | 6 +++--- tensorflow/contrib/optimizer_v2/adam.py | 16 ++++++++-------- .../api_def/base_api/api_def_ApplyAdam.pbtxt | 8 ++++---- .../base_api/api_def_ResourceApplyAdam.pbtxt | 8 ++++---- tensorflow/python/training/adam.py | 16 ++++++++-------- 5 files changed, 27 insertions(+), 27 deletions(-) diff --git a/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py b/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py index aeca900bc8f..72117c1e81a 100644 --- a/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py +++ b/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py @@ -56,21 +56,21 @@ class LazyAdamOptimizer(adam.AdamOptimizer): epsilon_t = 
math_ops.cast(self._epsilon_t, var.dtype.base_dtype) lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power)) - # m := beta1 * m + (1 - beta1) * g_t + # \\(m := beta1 * m + (1 - beta1) * g_t\\) m = self.get_slot(var, "m") m_t = state_ops.scatter_update(m, grad.indices, beta1_t * array_ops.gather(m, grad.indices) + (1 - beta1_t) * grad.values, use_locking=self._use_locking) - # v := beta2 * v + (1 - beta2) * (g_t * g_t) + # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\) v = self.get_slot(var, "v") v_t = state_ops.scatter_update(v, grad.indices, beta2_t * array_ops.gather(v, grad.indices) + (1 - beta2_t) * math_ops.square(grad.values), use_locking=self._use_locking) - # variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t)) + # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\) m_t_slice = array_ops.gather(m_t, grad.indices) v_t_slice = array_ops.gather(v_t, grad.indices) denominator_slice = math_ops.sqrt(v_t_slice) + epsilon_t diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py index 42b7f92a76c..e863ca12442 100644 --- a/tensorflow/contrib/optimizer_v2/adam.py +++ b/tensorflow/contrib/optimizer_v2/adam.py @@ -41,21 +41,21 @@ class AdamOptimizer(optimizer_v2.OptimizerV2): Initialization: ``` - m_0 <- 0 (Initialize initial 1st moment vector) - v_0 <- 0 (Initialize initial 2nd moment vector) - t <- 0 (Initialize timestep) + \\(m_0 <- 0\\) (Initialize initial 1st moment vector) + \\(v_0 <- 0\\) (Initialize initial 2nd moment vector) + \\(t <- 0\\) (Initialize timestep) ``` The update rule for `variable` with gradient `g` uses an optimization described at the end of section2 of the paper: ``` - t <- t + 1 - lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t) + $$t <- t + 1$$ + $$lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)$$ - m_t <- beta1 * m_{t-1} + (1 - beta1) * g - v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g - variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon) + $$m_t <- beta1 * m_{t-1} + (1 - beta1) * g$$ + $$v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g$$ + $$variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)$$ ``` The default value of 1e-8 for epsilon might not be a good default in diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt index c2858a1bfbb..9bffaa79f5b 100644 --- a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt @@ -82,9 +82,9 @@ END } summary: "Update \'*var\' according to the Adam algorithm." 
description: < Date: Mon, 2 Apr 2018 13:11:26 +0800 Subject: [PATCH 0054/1734] Fix minor typo --- tensorflow/contrib/optimizer_v2/adam.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py index e863ca12442..9bc160c0b94 100644 --- a/tensorflow/contrib/optimizer_v2/adam.py +++ b/tensorflow/contrib/optimizer_v2/adam.py @@ -51,11 +51,11 @@ class AdamOptimizer(optimizer_v2.OptimizerV2): ``` $$t <- t + 1$$ - $$lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)$$ + $$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$ - $$m_t <- beta1 * m_{t-1} + (1 - beta1) * g$$ - $$v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g$$ - $$variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)$$ + $$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$ + $$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$ + $$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$ ``` The default value of 1e-8 for epsilon might not be a good default in From 41074cd435a5d8b3831db8333b3669877b15a2c9 Mon Sep 17 00:00:00 2001 From: imsheridan Date: Mon, 2 Apr 2018 13:14:48 +0800 Subject: [PATCH 0055/1734] Fix minor typo --- tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt | 8 ++++---- tensorflow/python/training/adam.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt index 9bffaa79f5b..fc2cb094716 100644 --- a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt @@ -82,9 +82,9 @@ END } summary: "Update \'*var\' according to the Adam algorithm." description: < Date: Tue, 3 Apr 2018 00:18:32 +0800 Subject: [PATCH 0056/1734] Fix minor typo --- .../api_def/base_api/api_def_ResourceApplyAdam.pbtxt | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt index 109b68e472f..5c60fa3aa15 100644 --- a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt @@ -76,9 +76,8 @@ END } summary: "Update \'*var\' according to the Adam algorithm." 
description: <
Date: Tue, 3 Apr 2018 18:09:46 +0800
Subject: [PATCH 0057/1734] CLN: fix wrong hanging indentation

---
 tensorflow/contrib/opt/python/training/adamax.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py
index ba9e79be99b..4692f88349d 100644
--- a/tensorflow/contrib/opt/python/training/adamax.py
+++ b/tensorflow/contrib/opt/python/training/adamax.py
@@ -186,6 +186,6 @@ class AdaMaxOptimizer(adam.AdamOptimizer):
       beta1_power = self._get_beta_accumulators()
       with ops.colocate_with(beta1_power):
         update_beta1 = beta1_power.assign(
-          beta1_power * self._beta1_t, use_locking=self._use_locking)
+            beta1_power * self._beta1_t, use_locking=self._use_locking)
     return control_flow_ops.group(*update_ops + [update_beta1],
                                   name=name_scope)

From c3c3fb62f34213f96a6c9bb4174240168d8b5873 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
Date: Tue, 3 Apr 2018 18:10:18 +0800
Subject: [PATCH 0058/1734] CLN: add deps: eager:context

---
 tensorflow/contrib/opt/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index aaf00128081..39a86dbd717 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -44,6 +44,7 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],

From 9e1be727f1427284df4dda77f47a686cac07d098 Mon Sep 17 00:00:00 2001
From: Wenhao Hu
Date: Wed, 4 Apr 2018 01:33:08 +0900
Subject: [PATCH 0059/1734] add functional_ops to BUILD

---
 tensorflow/python/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 3cbeb34c547..8b65b3f0576 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -1916,6 +1916,7 @@ py_library(
         ":array_ops",
         ":dtypes",
         ":framework_ops",
+        ":functional_ops",
         ":linalg_ops_gen",
         ":math_ops",
         "//third_party/py/numpy",

From e7f3ed2477c7910e68573880efd2310e149ca785 Mon Sep 17 00:00:00 2001
From: mbhuiyan
Date: Wed, 4 Apr 2018 10:52:49 -0700
Subject: [PATCH 0060/1734] Fixing a unit test failure for INTEL MKL where a
 memory allocation check failed because of the use of INTEL MKL

---
 .../direct_session_with_tracking_alloc_test.cc | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
index 31fb128f937..0ff022a8bce 100644
--- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
@@ -101,11 +101,24 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelTest) {
       EXPECT_EQ(2, shape.dim_size());
       EXPECT_EQ(2, shape.dim(0).size());
       EXPECT_EQ(1, shape.dim(1).size());
+#ifndef INTEL_MKL
+      // If MKL is used, the graph goes through several additional
+      // graph rewrite passes. In TF, every time a graph pass
+      // happens, "constant" nodes are allocated
+      // and deallocated. Each allocation calls
+      // FindChunkPtr of BFCAllocator,
+      // which increments the value of AllocationId.
+      // Thus AllocationId becomes more than 3 and 4 if
+      // MKL is used; they can be 10 and 11 or
+      // other numbers. So if MKL is used, the
+      // following check will not hold.
+      // Thus, skip the check if MKL is used.
      if (node->name() == y->name()) {
        EXPECT_EQ(3, cm->AllocationId(node, 0));
      } else {
        EXPECT_EQ(4, cm->AllocationId(node, 0));
      }
+#endif
    }
    EXPECT_LE(0, cm->MaxExecutionTime(node));
    EXPECT_GE(run_duration_micros, cm->MaxExecutionTime(node));

From 0b9eedd684b4085ab65d60627efa8594a92a0b98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
Date: Sat, 7 Apr 2018 11:47:03 +0800
Subject: [PATCH 0061/1734] TST: add test case for duplicate indices

---
 .../kernel_tests/scatter_nd_ops_test.py | 28 +++++++++++++++++--
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index 03b2f892c62..dfe9600dbb2 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -366,13 +366,35 @@ class ScatterNdTest(test.TestCase):

   def testString(self):
     indices = constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32)
-    updates = constant_op.constant(["four", "three", "one", "seven"], dtype=dtypes.string)
+    updates = constant_op.constant(["four", "three", "one", "seven"],
+                                   dtype=dtypes.string)
     expected = np.array(["", "one", "", "three", "four", "", "", "seven"])
     scatter = self.scatter_nd(indices, updates, shape=(8,))
-
     with self.test_session() as sess:
       result = sess.run(scatter)
-      self.assertTrue(np.array_equal(result, expected))
+      self.assertAllEqual(expected, result)
+
+    # Same index is updated twice by the same value.
+    indices = constant_op.constant([[4], [3], [3], [7]], dtype=dtypes.int32)
+    updates = constant_op.constant(["a", "b", "b", "c"],
+                                   dtype=dtypes.string)
+    expected = np.array(["", "", "", "bb", "a", "", "", "c"])
+    scatter = self.scatter_nd(indices, updates, shape=(8,))
+    with self.test_session() as sess:
+      result = sess.run(scatter)
+      self.assertAllEqual(expected, result)
+
+    # Same index is updated twice by different values.
+    indices = constant_op.constant([[4], [3], [3], [7]], dtype=dtypes.int32)
+    updates = constant_op.constant(["a", "b", "c", "d"],
+                                   dtype=dtypes.string)
+    expected = [np.array(["", "", "", "bc", "a", "", "", "d"]),
+                np.array(["", "", "", "cb", "a", "", "", "d"])]
+    scatter = self.scatter_nd(indices, updates, shape=(8,))
+    with self.test_session() as sess:
+      result = sess.run(scatter)
+      self.assertTrue(np.array_equal(result, expected[0]) or
+                      np.array_equal(result, expected[1]))

   def testRank3ValidShape(self):
     indices = array_ops.zeros([2, 2, 2], dtypes.int32)

From 9e1bbbc0fb770f077d9de295b53181e3592f1d24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
Date: Sat, 7 Apr 2018 12:07:11 +0800
Subject: [PATCH 0062/1734] DOC: remove the misleading 'empty tensor'

---
 tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
index 4e95895f548..58753a651a1 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
@@ -25,7 +25,7 @@
 A new tensor with the given shape and updates applied according
 to the indices.
 END
   }
-  summary: "Scatter `updates` into a new empty tensor according to `indices`."
+  summary: "Scatter `updates` into a new tensor according to `indices`."
description: < Date: Sat, 7 Apr 2018 22:42:10 +0900 Subject: [PATCH 0063/1734] move dependency --- tensorflow/python/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 9dad747ac0b..7d40c133c4f 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -1970,6 +1970,7 @@ py_library( ":array_ops", ":control_flow_ops", ":framework_for_generated_wrappers", + ":functional_ops", ":linalg_ops", ":math_ops", "//tensorflow/python/ops/linalg:linalg_impl", @@ -1984,7 +1985,6 @@ py_library( ":array_ops", ":dtypes", ":framework_ops", - ":functional_ops", ":linalg_ops_gen", ":math_ops", "//third_party/py/numpy", From 7c95ee3ca48f4e50818f12daf749cbe050a8643f Mon Sep 17 00:00:00 2001 From: Brett Koonce Date: Sun, 18 Mar 2018 13:41:12 -0700 Subject: [PATCH 0064/1734] contrib: minor spelling tweaks packages: data training tensor_forest --- .../python/kernel_tests/dataset_serialization_test_base.py | 2 +- .../data/python/kernel_tests/interleave_dataset_op_test.py | 4 ++-- tensorflow/contrib/data/python/ops/scan_ops.py | 2 +- tensorflow/contrib/tensor_forest/client/random_forest.py | 2 +- .../hybrid/core/ops/hard_routing_function_op.cc | 2 +- .../hybrid/core/ops/stochastic_hard_routing_function_op.cc | 2 +- .../hybrid/core/ops/stochastic_hard_routing_gradient_op.cc | 2 +- tensorflow/contrib/tensor_forest/kernels/tree_utils.cc | 4 ++-- tensorflow/contrib/tensor_forest/kernels/tree_utils.h | 2 +- .../tensor_forest/kernels/v4/decision-tree-resource.h | 2 +- .../tensor_forest/kernels/v4/decision_node_evaluator.h | 2 +- tensorflow/contrib/tensor_forest/ops/model_ops.cc | 2 +- tensorflow/contrib/tensor_forest/ops/stats_ops.cc | 4 ++-- tensorflow/contrib/tensor_forest/python/tensor_forest.py | 2 +- tensorflow/contrib/training/python/training/resample.py | 2 +- tensorflow/contrib/training/python/training/sampling_ops.py | 6 +++--- .../python/training/sequence_queueing_state_saver.py | 4 ++-- 17 files changed, 23 insertions(+), 23 deletions(-) diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py index dbc35097ddd..78ecce8f7da 100644 --- a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py +++ b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py @@ -163,7 +163,7 @@ class DatasetSerializationTestBase(test.TestCase): num_outputs, sparse_tensors=False, verify_exhausted=True): - """Verifies that restoring into an already initilized iterator works. + """Verifies that restoring into an already initialized iterator works. Args: ds_fn: See `run_core_tests`. diff --git a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py index 256ad8d94dc..6a88a7caf6c 100644 --- a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py @@ -338,7 +338,7 @@ class ParallelInterleaveDatasetTest(test.TestCase): def _testTwoThreadsNoContentionWithRaces(self, sloppy=False): """Tests where all the workers race in producing elements. - Note: this is in contrast with the prevous test which carefully sequences + Note: this is in contrast with the previous test which carefully sequences the execution of the map functions. 
Args: @@ -424,7 +424,7 @@ class ParallelInterleaveDatasetTest(test.TestCase): def _testTwoThreadsNoContentionWithRacesAndBlocking(self, sloppy=False): """Tests where all the workers race in producing elements. - Note: this is in contrast with the prevous test which carefully sequences + Note: this is in contrast with the previous test which carefully sequences the execution of the map functions. diff --git a/tensorflow/contrib/data/python/ops/scan_ops.py b/tensorflow/contrib/data/python/ops/scan_ops.py index 1c88366273f..fe49ee8b194 100644 --- a/tensorflow/contrib/data/python/ops/scan_ops.py +++ b/tensorflow/contrib/data/python/ops/scan_ops.py @@ -57,7 +57,7 @@ class _ScanDataset(dataset_ops.Dataset): self._output_shapes = None self._output_types = None - # Iteratively rerun the scan function until reaching a fixed pont on + # Iteratively rerun the scan function until reaching a fixed point on # `self._state_shapes`. need_to_rerun = True while need_to_rerun: diff --git a/tensorflow/contrib/tensor_forest/client/random_forest.py b/tensorflow/contrib/tensor_forest/client/random_forest.py index 4abcc20ed33..35e8c92aba3 100644 --- a/tensorflow/contrib/tensor_forest/client/random_forest.py +++ b/tensorflow/contrib/tensor_forest/client/random_forest.py @@ -399,7 +399,7 @@ def get_combined_model_fn(model_fns): training ops: tf.group them. loss: average them. predictions: concat probabilities such that predictions[*][0-C1] are the - probablities for output 1 (where C1 is the number of classes in output 1), + probabilities for output 1 (where C1 is the number of classes in output 1), predictions[*][C1-(C1+C2)] are the probabilities for output 2 (where C2 is the number of classes in output 2), etc. Also stack predictions such that predictions[i][j] is the class prediction for example i and output j. diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc index cf0db788a41..06bfe871fdf 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc +++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc @@ -80,7 +80,7 @@ REGISTER_OP("HardRoutingFunction") regression model that translates from node features to probabilities. - path_probility: `path_probability[i]` gives the probability of reaching each + path_probability: `path_probability[i]` gives the probability of reaching each node in `path[i]`. path: `path[i][j]` gives the jth node in the path taken by the ith data instance. diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc index c9df09bfda4..1a055756c08 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc +++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc @@ -85,7 +85,7 @@ REGISTER_OP("StochasticHardRoutingFunction") regression model that translates from node features to probabilities. - path_probility: `path_probability[i]` gives the probability of reaching each + path_probability: `path_probability[i]` gives the probability of reaching each node in `path[i]`. path: `path[i][j]` gives the jth node in the path taken by the ith data instance. 
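The `path_probability` output documented above can be pictured with a small standalone sketch (illustrative only; the function name and the hard-routing rule below are assumptions for exposition, not the op's actual API):

    # Probability of reaching each node on a root-to-leaf path when hard
    # routing always follows the more likely child at every split.
    def path_probabilities(go_left_probs):
      """go_left_probs[d]: probability of routing left at depth d."""
      prob = 1.0
      probs = [prob]  # the root is always reached
      for p in go_left_probs:
        prob *= max(p, 1.0 - p)  # hard routing takes the likelier branch
        probs.append(prob)
      return probs

    print(path_probabilities([0.9, 0.3, 0.8]))  # [1.0, 0.9, 0.63, 0.504]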
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc index b0d8b832b54..7d092bbc24d 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc +++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc @@ -81,7 +81,7 @@ REGISTER_OP("StochasticHardRoutingGradient") tree_biases: `tree_biases[i]` gives the bias of the logistic regression model that translates from node features to probabilities. - path_probility: `path_probability[i]` gives the probability of reaching each + path_probability: `path_probability[i]` gives the probability of reaching each node in `path[i]`. path: `path[i][j]` gives the jth node in the path taken by the ith data instance. diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc index 44997ec5d6d..cefcc960510 100644 --- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc +++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc @@ -421,7 +421,7 @@ double getChebyshevEpsilon(const std::vector& mu1, const std::vector& mu2) { // Math time!! // We are trying to minimize d = |mu1 - x|^2 + |mu2 - y|^2 over the surface. - // Using Langrange multipliers, we get + // Using Lagrange multipliers, we get // partial d / partial x = -2 mu1 + 2 x = lambda_1 1 + 2 lambda_3 x // partial d / partial y = -2 mu2 + 2 y = lambda_2 1 - 2 lambda_3 y // or @@ -485,7 +485,7 @@ double getChebyshevEpsilon(const std::vector& mu1, } double sdiscrim = sqrt(discrim); - // TODO(thomaswc): Analyze whetever one of these is always closer. + // TODO(thomaswc): Analyze whatever one of these is always closer. double v1 = (-b + sdiscrim) / (2 * a); double v2 = (-b - sdiscrim) / (2 * a); double dist1 = getDistanceFromLambda3(v1, mu1, mu2); diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h index edbac670067..03aab1b61ee 100644 --- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h +++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h @@ -123,7 +123,7 @@ bool BestSplitDominatesRegression(const Tensor& total_sums, const Tensor& split_squares, int32 accumulator); -// Performs booststrap_samples bootstrap samples of the best split's class +// Performs bootstrap_samples bootstrap samples of the best split's class // counts and the second best splits's class counts, and returns true if at // least dominate_fraction of the time, the former has a better (lower) // Gini impurity. Does not take over ownership of *rand. diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h index 328af28725a..d3edb437337 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h +++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h @@ -60,7 +60,7 @@ class DecisionTreeResource : public ResourceBase { mutex* get_mutex() { return &mu_; } // Return the TreeNode for the leaf that the example ends up at according - // to decsion_tree_. Also fill in that leaf's depth if it isn't nullptr. + // to decision_tree_. Also fill in that leaf's depth if it isn't nullptr. 
int32 TraverseTree(const std::unique_ptr& input_data, int example, int32* depth, TreePath* path) const; diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h index bf2b2aaa3c8..3db351c328c 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h +++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h @@ -60,7 +60,7 @@ class InequalityDecisionNodeEvaluator : public BinaryDecisionNodeEvaluator { bool include_equals_; }; -// Evalutor for splits with multiple weighted features. +// Evaluator for splits with multiple weighted features. class ObliqueInequalityDecisionNodeEvaluator : public BinaryDecisionNodeEvaluator { public: diff --git a/tensorflow/contrib/tensor_forest/ops/model_ops.cc b/tensorflow/contrib/tensor_forest/ops/model_ops.cc index 3099cccdf8b..98124d519c7 100644 --- a/tensorflow/contrib/tensor_forest/ops/model_ops.cc +++ b/tensorflow/contrib/tensor_forest/ops/model_ops.cc @@ -165,7 +165,7 @@ tree_handle: The handle to the tree. leaf_ids: `leaf_ids[i]` is the leaf id for input i. input_labels: The training batch's labels as a 1 or 2-d tensor. 'input_labels[i][j]' gives the j-th label/target for the i-th input. -input_weights: The training batch's eample weights as a 1-d tensor. +input_weights: The training batch's weights as a 1-d tensor. 'input_weights[i]' gives the weight for the i-th input. )doc"); diff --git a/tensorflow/contrib/tensor_forest/ops/stats_ops.cc b/tensorflow/contrib/tensor_forest/ops/stats_ops.cc index e8b5c5d8a6e..be0a11546d2 100644 --- a/tensorflow/contrib/tensor_forest/ops/stats_ops.cc +++ b/tensorflow/contrib/tensor_forest/ops/stats_ops.cc @@ -83,7 +83,7 @@ Grows the tree for finished nodes and allocates waiting nodes. params: A serialized TensorForestParams proto. tree_handle: The handle to the tree. stats_handle: The handle to the stats. -finshed_nodes: A 1-d Tensor of finished node ids from ProcessInput. +finished_nodes: A 1-d Tensor of finished node ids from ProcessInput. )doc"); REGISTER_OP("ProcessInputV4") @@ -119,7 +119,7 @@ sparse_input_values: The values tensor from the SparseTensor input. sparse_input_shape: The shape tensor from the SparseTensor input. input_labels: The training batch's labels as a 1 or 2-d tensor. 'input_labels[i][j]' gives the j-th label/target for the i-th input. -input_weights: The training batch's eample weights as a 1-d tensor. +input_weights: The training batch's weights as a 1-d tensor. 'input_weights[i]' gives the weight for the i-th input. finished_nodes: A 1-d tensor of node ids that have finished and are ready to grow. diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py index 3650b5d52fe..b9bcbb170b0 100644 --- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py +++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py @@ -212,7 +212,7 @@ class ForestHParams(object): self.regression = getattr(self, 'regression', False) # Num_outputs is the actual number of outputs (a single prediction for - # classification, a N-dimenensional point for regression). + # classification, a N-dimensional point for regression). 
self.num_outputs = self.num_classes if self.regression else 1 # Add an extra column to classes for storing counts, which is needed for diff --git a/tensorflow/contrib/training/python/training/resample.py b/tensorflow/contrib/training/python/training/resample.py index b16159bc16b..7b8332b1d67 100644 --- a/tensorflow/contrib/training/python/training/resample.py +++ b/tensorflow/contrib/training/python/training/resample.py @@ -77,7 +77,7 @@ def resample_at_rate(inputs, rates, scope=None, seed=None, back_prop=False): Args: inputs: A list of tensors, each of which has a shape of `[batch_size, ...]` - rates: A tensor of shape `[batch_size]` contiaining the resampling rates + rates: A tensor of shape `[batch_size]` containing the resampling rates for each input. scope: Scope for the op. seed: Random seed to use. diff --git a/tensorflow/contrib/training/python/training/sampling_ops.py b/tensorflow/contrib/training/python/training/sampling_ops.py index ba888f87dc8..7140f2a46d5 100644 --- a/tensorflow/contrib/training/python/training/sampling_ops.py +++ b/tensorflow/contrib/training/python/training/sampling_ops.py @@ -123,7 +123,7 @@ def rejection_sample(tensors, batch_size=batch_size, num_threads=queue_threads) - # Queues return a single tensor if the list of enqued tensors is one. Since + # Queues return a single tensor if the list of enqueued tensors is one. Since # we want the type to always be the same, always return a list. if isinstance(minibatch, ops.Tensor): minibatch = [minibatch] @@ -312,7 +312,7 @@ def _verify_input(tensor_list, labels, probs_list): """Verify that batched inputs are well-formed.""" checked_probs_list = [] for probs in probs_list: - # Since number of classes shouldn't change at runtime, probalities shape + # Since number of classes shouldn't change at runtime, probabilities shape # should be fully defined. probs.get_shape().assert_is_fully_defined() @@ -407,7 +407,7 @@ def _calculate_acceptance_probabilities(init_probs, target_probs): ``` - A solution for a_i in terms of the other variabes is the following: + A solution for a_i in terms of the other variables is the following: ```a_i = (t_i / p_i) / max_i[t_i / p_i]``` """ # Make list of t_i / p_i. diff --git a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py index 99d486b1833..39d75a08060 100644 --- a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py +++ b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py @@ -876,7 +876,7 @@ class SequenceQueueingStateSaver(object): ]): self._length = array_ops.identity(self._length) - # Only create barrier; enqueu and dequeue operations happen when you + # Only create barrier; enqueue and dequeue operations happen when you # access prefetch_op and next_batch. self._create_barrier() self._scope = scope @@ -1637,7 +1637,7 @@ def _move_sparse_tensor_out_context(input_context, input_sequences, num_unroll): For `key, value` pairs in `input_context` with `SparseTensor` `value` removes them from `input_context` and transforms the `value` into a sequence and - then adding `key`, transformed `value` into `input_seuqences`. + then adding `key`, transformed `value` into `input_sequences`. The transformation is done by adding a new first dimension of `value_length` equal to that of the other values in input_sequences` and tiling the `value` every `num_unroll` steps. 
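Stepping back to the duplicate-index behavior exercised by patch 0061 above: for string tensors, `scatter_nd` combines duplicate updates with addition, which for strings is concatenation. A minimal NumPy model of that semantics (a sketch for illustration, not TensorFlow's implementation; the combination order for duplicates is nondeterministic in TF, as the test's two accepted outcomes show):

    import numpy as np

    def scatter_nd_1d(indices, updates, shape):
      out = np.full(shape, "", dtype=object)
      for i, u in zip(indices, updates):
        out[i] = out[i] + u  # duplicates accumulate; strings concatenate
      return out

    print(scatter_nd_1d([4, 3, 3, 7], ["a", "b", "b", "c"], (8,)))
    # -> ['' '' '' 'bb' 'a' '' '' 'c'], matching the test's expectation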
From 61994c21f5ddee273e0d79b08444b48858e11bfd Mon Sep 17 00:00:00 2001
From: imsheridan
Date: Tue, 10 Apr 2018 20:00:22 +0800
Subject: [PATCH 0065/1734] Remove breaking ``` for math equations

---
 tensorflow/contrib/optimizer_v2/adam.py | 4 ----
 tensorflow/python/training/adam.py      | 4 ----
 2 files changed, 8 deletions(-)

diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py
index 9bc160c0b94..a38c98f4711 100644
--- a/tensorflow/contrib/optimizer_v2/adam.py
+++ b/tensorflow/contrib/optimizer_v2/adam.py
@@ -40,23 +40,19 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):

   Initialization:

-  ```
   \\(m_0 <- 0\\) (Initialize initial 1st moment vector)
   \\(v_0 <- 0\\) (Initialize initial 2nd moment vector)
   \\(t <- 0\\) (Initialize timestep)
-  ```

   The update rule for `variable` with gradient `g` uses an optimization
   described at the end of section 2 of the paper:

-  ```
   $$t <- t + 1$$
   $$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$

   $$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$
   $$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
   $$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
-  ```

   The default value of 1e-8 for epsilon might not be a good default in
   general. For example, when training an Inception network on ImageNet a
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 1f2c40f18ea..dc0f1aba09a 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -43,23 +43,19 @@ class AdamOptimizer(optimizer.Optimizer):

   Initialization:

-  ```
   \\(m_0 <- 0\\) (Initialize initial 1st moment vector)
   \\(v_0 <- 0\\) (Initialize initial 2nd moment vector)
   \\(t <- 0\\) (Initialize timestep)
-  ```

   The update rule for `variable` with gradient `g` uses an optimization
   described at the end of section 2 of the paper:

-  ```
   $$t <- t + 1$$
   $$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$

   $$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$
   $$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
   $$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
-  ```

   The default value of 1e-8 for epsilon might not be a good default in
   general. For example, when training an Inception network on ImageNet a

From 1f9eeeb842a052326da766a626b32b2e7a50ffcc Mon Sep 17 00:00:00 2001
From: Anna R
Date: Tue, 10 Apr 2018 10:50:01 -0700
Subject: [PATCH 0066/1734] Adding release notes for 1.8.0rc0

---
 RELEASE.md | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/RELEASE.md b/RELEASE.md
index e8459531748..6ec03f94d88 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,62 @@
+# Release 1.8.0
+
+## Major Features And Improvements
+* Can now pass `tf.contrib.distribute.MirroredStrategy()` to `tf.estimator.RunConfig()` to run an Estimator model on multiple GPUs on one machine.
+* Add `tf.contrib.data.prefetch_to_device()`, which supports prefetching to GPU memory.
+* Added Gradient Boosted Trees as pre-made Estimators: BoostedTreesClassifier, BoostedTreesRegressor.
+* Add 3rd generation pipeline config for Cloud TPUs which improves performance and usability.
+* `tf.contrib.bayesflow` is moving out to its own repo.
+* Added `tf.contrib.{proto,rpc}` to allow generic proto parsing and RPC communication.
+
+## Bug Fixes and Other Changes
+* `tf.data`:
+  * Add `tf.contrib.data.prefetch_to_device`, which enables prefetching dataset elements to GPU memory.
+  * Add `tf.contrib.data.AUTOTUNE`, which allows the tf.data runtime to automatically tune the prefetch buffer sizes based on your system and environment.
+  * Add `tf.contrib.data.make_csv_dataset` for building datasets of CSV files.
+  * With eager execution Datasets can now be used as standard python iterators (`for batch in dataset:`). Both `Dataset.__iter__()` and `Dataset.make_one_shot_iterator()` can now be used to create iterators in eager mode.
+* Eager Execution:
+  * Can now naturally iterate over `tf.data.Dataset` objects without wrapping in a `tf.contrib.eager.Iterator`. For example: `for x in tf.data.Dataset.range(10): print(x)`
+  * Automatic device placement has been enabled (i.e., use a GPU if available automatically, without requiring an explicit `with tf.device(“/gpu:0”)`) (Fixes #14133)
+  * `tf.GradientTape` has moved out of contrib.
+* `tf.keras`:
+  * Added the fashion mnist dataset.
+  * New data preprocessing functions: `image/random_brightness`, `sequence/TimeseriesGenerator`, and `text/hashing_trick`.
+* Accelerated Linear Algebra (XLA):
+  * Select and scatter in reference util and evaluator now use lexicographical order to break ties.
+* TensorFlow Debugger (tfdbg) CLI:
+  * During tensor-filter operations, allow exclusion of nodes by regular expressions.
+  * Fix spurious background colors in some text terminals.
+* tf.contrib:
+  * Add meta-distribution BatchReshape which reshapes batch dimensions.
+  * `tf.contrib.layers.recompute_grad` works for explicit gradient checkpointing on TPU.
+  * Add `tf.contrib.framework.argsort`.
+  * Allow `DNNBoostedTreeCombinedEstimator` to work with core versions of feature columns and losses.
+  * Add non-linear image warping ops: `tf.contrib.image.sparse_image_warp`, `tf.contrib.image.dense_image_warp`, and `tf.contrib.image.interpolate_spline`.
+  * Fix bug in `tf.contrib.opt.MultitaskOptimizerWrapper` where types of tensors were mismatched.
+* Other:
+  * Low-level graph construction now calls the TensorFlow C API. This change should be invisible to most users, but can be disabled by setting the environment variable `TF_C_API_GRAPH_CONSTRUCTION=0` in this release. Future releases will remove the ability to disable this change. Please [file a bug](https://github.com/tensorflow/tensorflow/issues/new) if you find yourself using this escape hatch.
+  * Add description of shapes and a pointer to tutorial notebook in `tf.distributions.Distribution`.
+  * Update scatter operations:
+    * Add `tf.scatter_min` and `tf.scatter_max`
+    * Extend scatter operations to work with a scalar update parameter.
+  * Move cuDNN RNN ops to core for use in TensorFlow codebase only.
+  * Add `float64` support for `Conv2d`, `Conv2dBackpropInput`, and `Conv2dBackpropFilter`.
+  * Add `float64` support for `AvgPool`/`AvgPoolGrad`.
+  * Make graph name scope thread local so that they work correctly in multi-threaded environments.
+  * Update nsync synchronization library to avoid slow primitives on Linux.
+  * Removed need to put nsync/public on C include path when building custom ops.
+  * Add `tf.image.psnr`, `tf.image.ssim`, `tf.image.ssim_multiscale`, `tf.image.image_gradients`, `tf.image.sobel_edges`.
+  * Add links to https://js.tensorflow.org.
+  * Fix non-uniformity of orthogonal matrices.
+  * Fix bug where multi-image Estimator eval summaries were not displayed correctly.
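As a quick illustration of the eager-iteration bullet above (a sketch written against the 1.8-era API as these notes describe it, not an excerpt from the release):

    import tensorflow as tf

    tf.enable_eager_execution()

    # Datasets act as ordinary Python iterables once eager execution is on.
    for x in tf.data.Dataset.range(3):
      print(x)  # prints tf.Tensor values 0, 1 and 2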
+ +## Thanks to our Contributors + +This release contains contributions from many people at Google, as well as: + +4d55397500, Aghasy, Alan Du, Alan Lee, Alan Yee, Alex Wiltschko, Animesh Karnewar, Ankit Gupta, Anton Matosov, Aris L, Ben Barsdell, Brent Yi, Brett Koonce, Carl Thomé, cbockman, Chikanaga Tomoyuki, Chris Tava, CéDric Deltheil, Dahan Gong, Dalmo Cirne, Daniel Erenrich, David Norman, DavidNorman, Edd Wilder-James, Fanjin Zeng, Felix Abecassis, fo40225, George Sterpu, Giovanni Terlingen, Gor Baghdasaryan, Guillaume Klein, Hanchen Li, Ilya Polenov, Jakub Kolodziejczyk, Jason Sadler, Jayaram Bobba, Jerry Liu, jinghuangintel, Jiongyan Zhang (张炯衍), Joel Shor, Jong Wook Kim, Julian Eisenschlos, Karl Lessard, Krish Ravindranath, Loo Rong Jie, Lukas Geiger, Luke Iwanski, Mahmoud Abuzaina, ManHyuk, Marvin Richter, Maximilian Mitchell, Mohammad Ashraf Bhuiyan, msofka, Mustafa Kasap, Nathan Burnham, Nathan Luehr, Naveen Marri, ngc92, nio1814, Oleg Zabluda, Ou Changkun, Panos Ipeirotis, Paul Van Eck, Peter Lee, Piotr Czapla, qjivy, Rholais Lii, Rodrigo Formigone, Russell Klopfer, ryantimjohn, Sang Han, SebastiáN RamíRez, shengfuintel, Siby Jose Plathottam, Silver Chan, Stanislaw Antol, Taehoon Lee, Tarang Chugh, Ted Chang, Thomas Bastiani, Xian Xu, Xiaoming (Jason) Cui, Yan Facai (颜发才), yaox12, Yashal Shakti Kanungo, Yong Tang, Yuan (Terry) Tang, Yuxin Wu, Ziyue(Louis) Lu + + # Release 1.7.0 ## Major Features And Improvements From c2582d40474211877764b5ac24d412384d20bd25 Mon Sep 17 00:00:00 2001 From: Anna R Date: Tue, 10 Apr 2018 11:04:32 -0700 Subject: [PATCH 0067/1734] Update a few release notes --- RELEASE.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index 6ec03f94d88..83c14200ec2 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -13,9 +13,8 @@ * Add `tf.contrib.data.prefetch_to_device`, which enables prefetching dataset elements to GPU memory. * Add `tf.contrib.data.AUTOTUNE`, which allows the tf.data runtime to automatically tune the prefetch buffer sizes based on your system and environment. * Add `tf.contrib.data.make_csv_dataset` for building datasets of CSV files. - * With eager execution Datasets can now be used as standard python iterators (`for batch in dataset:`). Both `Dataset.__iter__()` and `Dataset.make_one_shot_iterator()` can now be used to create iterators in eager mode. * Eager Execution: - * Can now naturally iterate over `tf.data.Dataset` objects without wrapping in a `tf.contrib.eager.Iterator`. For example: `for x in tf.data.Dataset.range(10): print(x)` + * With eager execution Datasets can now be used as standard python iterators (`for batch in dataset:`). Both `Dataset.__iter__()` and `Dataset.make_one_shot_iterator()` can now be used to create iterators when eager execution is enabled. * Automatic device placement has been enabled (i.e., use a GPU if available automatically, without requiring an explicit `with tf.device(“/gpu:0”)`) (Fixes #14133) * `tf.GradientTape` has moved out of contrib. * `tf.keras`: @@ -24,8 +23,8 @@ * Accelerated Linear Algebra (XLA): * Select and scatter in reference util and evaluator now use lexicographical order to break ties. * TensorFlow Debugger (tfdbg) CLI: - * During tensor-filter operations, allow exclusion of nodes by regular expressions. - * Fix spurious background colors in some text terminals. + * During tensor-filter operations, allow exclusion of nodes by regular expressions. + * Fix spurious background colors in some text terminals. 
* tf.contrib: * Add meta-distribution BatchReshape which reshapes batch dimensions. * `tf.contrib.layers.recompute_grad` works for explicit gradient checkpointing on TPU. From e5d12651d3ff1accab74c79a9905e7ec3a05bfc2 Mon Sep 17 00:00:00 2001 From: Anna R Date: Tue, 10 Apr 2018 11:06:26 -0700 Subject: [PATCH 0068/1734] Formatting fix --- RELEASE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE.md b/RELEASE.md index 83c14200ec2..2717c75740a 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -25,7 +25,7 @@ * TensorFlow Debugger (tfdbg) CLI: * During tensor-filter operations, allow exclusion of nodes by regular expressions. * Fix spurious background colors in some text terminals. -* tf.contrib: +* `tf.contrib`: * Add meta-distribution BatchReshape which reshapes batch dimensions. * `tf.contrib.layers.recompute_grad` works for explicit gradient checkpointing on TPU. * Add `tf.contrib.framework.argsort`. From b8fe5bf30662155ae351b3dc794456d2c68b151c Mon Sep 17 00:00:00 2001 From: Anna R Date: Tue, 10 Apr 2018 11:13:35 -0700 Subject: [PATCH 0069/1734] Update version for 1.8.0rc0 --- tensorflow/core/public/version.h | 4 ++-- tensorflow/docs_src/install/install_c.md | 2 +- tensorflow/docs_src/install/install_go.md | 2 +- tensorflow/docs_src/install/install_java.md | 22 +++++++++---------- tensorflow/docs_src/install/install_linux.md | 22 +++++++++---------- tensorflow/docs_src/install/install_mac.md | 10 ++++----- .../docs_src/install/install_sources.md | 9 ++++++-- tensorflow/tools/docker/Dockerfile.devel | 2 +- .../tools/docker/Dockerfile.devel-cpu-mkl | 2 +- tensorflow/tools/docker/Dockerfile.devel-gpu | 2 +- tensorflow/tools/pip_package/setup.py | 2 +- 11 files changed, 42 insertions(+), 37 deletions(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 706968d3474..0ca7d8475fc 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -19,12 +19,12 @@ limitations under the License. // TensorFlow uses semantic versioning, see http://semver.org/. #define TF_MAJOR_VERSION 1 -#define TF_MINOR_VERSION 7 +#define TF_MINOR_VERSION 8 #define TF_PATCH_VERSION 0 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. 
"-alpha", "-alpha.1", // "-beta", "-rc", "-rc.1") -#define TF_VERSION_SUFFIX "" +#define TF_VERSION_SUFFIX "-rc0" #define TF_STR_HELPER(x) #x #define TF_STR(x) TF_STR_HELPER(x) diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md index 274413e2944..995b8ae6663 100644 --- a/tensorflow/docs_src/install/install_c.md +++ b/tensorflow/docs_src/install/install_c.md @@ -38,7 +38,7 @@ enable TensorFlow for C: OS="linux" # Change to "darwin" for macOS TARGET_DIRECTORY="/usr/local" curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.7.0.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0-rc0.tar.gz" | sudo tar -C $TARGET_DIRECTORY -xz The `tar` command extracts the TensorFlow C library into the `lib` diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md index 1a0956634d6..2938a8f7eef 100644 --- a/tensorflow/docs_src/install/install_go.md +++ b/tensorflow/docs_src/install/install_go.md @@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go: TF_TYPE="cpu" # Change to "gpu" for GPU support TARGET_DIRECTORY='/usr/local' curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.7.0.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0-rc0.tar.gz" | sudo tar -C $TARGET_DIRECTORY -xz The `tar` command extracts the TensorFlow C library into the `lib` diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md index cdde45a6f4f..c87eacfa939 100644 --- a/tensorflow/docs_src/install/install_java.md +++ b/tensorflow/docs_src/install/install_java.md @@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs: org.tensorflow tensorflow - 1.7.0 + 1.8.0-rc0 ``` @@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow: org.tensorflow tensorflow - 1.7.0 + 1.8.0-rc0 @@ -123,12 +123,12 @@ instead: org.tensorflow libtensorflow - 1.7.0 + 1.8.0-rc0 org.tensorflow libtensorflow_jni_gpu - 1.7.0 + 1.8.0-rc0 ``` @@ -147,7 +147,7 @@ refer to the simpler instructions above instead. Take the following steps to install TensorFlow for Java on Linux or macOS: 1. Download - [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.7.0.jar), + [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc0.jar), which is the TensorFlow Java Archive (JAR). 2. Decide whether you will run TensorFlow for Java on CPU(s) only or with @@ -166,7 +166,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS: OS=$(uname -s | tr '[:upper:]' '[:lower:]') mkdir -p ./jni curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.7.0.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0-rc0.tar.gz" | tar -xz -C ./jni ### Install on Windows @@ -174,10 +174,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS: Take the following steps to install TensorFlow for Java on Windows: 1. 
Download - [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.7.0.jar), + [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc0.jar), which is the TensorFlow Java Archive (JAR). 2. Download the following Java Native Interface (JNI) file appropriate for - [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.7.0.zip). + [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0-rc0.zip). 3. Extract this .zip file. @@ -225,7 +225,7 @@ must be part of your `classpath`. For example, you can include the downloaded `.jar` in your `classpath` by using the `-cp` compilation flag as follows: -
javac -cp libtensorflow-1.7.0.jar HelloTF.java
+javac -cp libtensorflow-1.8.0-rc0.jar HelloTF.java
### Running @@ -239,11 +239,11 @@ two files are available to the JVM: For example, the following command line executes the `HelloTF` program on Linux and macOS X: -
java -cp libtensorflow-1.7.0.jar:. -Djava.library.path=./jni HelloTF
+java -cp libtensorflow-1.8.0-rc0.jar:. -Djava.library.path=./jni HelloTF
And the following command line executes the `HelloTF` program on Windows: -
java -cp libtensorflow-1.7.0.jar;. -Djava.library.path=jni HelloTF
+java -cp libtensorflow-1.8.0-rc0.jar;. -Djava.library.path=jni HelloTF
If the program prints Hello from version, you've successfully installed TensorFlow for Java and are ready to use the API. If the program diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md index 04e4242b0ff..8387289fcf2 100644 --- a/tensorflow/docs_src/install/install_linux.md +++ b/tensorflow/docs_src/install/install_linux.md @@ -194,7 +194,7 @@ Take the following steps to install TensorFlow with Virtualenv: Virtualenv environment:
(tensorflow)$ pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl
+ https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl If you encounter installation problems, see [Common Installation Problems](#common_installation_problems). @@ -299,7 +299,7 @@ take the following steps:
      $ sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl
      
If this step fails, see @@ -485,7 +485,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      (tensorflow)$ pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl
+ https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl ## Validate your installation @@ -659,14 +659,14 @@ This section documents the relevant values for Linux installations. CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp27-none-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp27-none-linux_x86_64.whl
 
Note that GPU support requires the NVIDIA hardware and software described in @@ -678,14 +678,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp34-cp34m-linux_x86_64.whl
 
Note that GPU support requires the NVIDIA hardware and software described in @@ -697,14 +697,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp35-cp35m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp35-cp35m-linux_x86_64.whl
 
@@ -716,14 +716,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp36-cp36m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp36-cp36m-linux_x86_64.whl
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md index b3e9616a059..a237d1af540 100644 --- a/tensorflow/docs_src/install/install_mac.md +++ b/tensorflow/docs_src/install/install_mac.md @@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv: TensorFlow in the active Virtualenv is as follows:
 $ pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl If you encounter installation problems, see [Common Installation Problems](#common-installation-problems). @@ -242,7 +242,7 @@ take the following steps: issue the following command:
 $ sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl 
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl If the preceding command fails, see [installation problems](#common-installation-problems). @@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment: TensorFlow for Python 2.7:
 (targetDirectory)$ pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.whl
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py2-none-any.whl @@ -524,7 +524,7 @@ The value you specify depends on your Python version.
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py2-none-any.whl
 
@@ -532,5 +532,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl
 
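Editor's note on the version strings in this patch: the wheel URLs above use `1.8.0rc0`, while the C and Java archives earlier use `1.8.0-rc0`. That is pip's version normalization at work; the `setup.py` change later in this patch keeps the semver string `'1.8.0-rc0'` and, per the comment in that file, removes the `-` characters for pip. A minimal sketch, for illustration only:

```python
# Illustration only: the semver suffix from version.h ("-rc0") becomes the
# PEP 440 pre-release tag "rc0" once setup.py strips the dashes for pip.
semver_version = "1.8.0-rc0"
pip_version = semver_version.replace("-", "")
assert pip_version == "1.8.0rc0"
print(pip_version)
```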
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md index 7d7c2aa75ae..677e3329b6b 100644 --- a/tensorflow/docs_src/install/install_sources.md +++ b/tensorflow/docs_src/install/install_sources.md @@ -350,10 +350,10 @@ Invoke `pip install` to install that pip package. The filename of the `.whl` file depends on your platform. For example, the following command will install the pip package -for TensorFlow 1.7.0 on Linux: +for TensorFlow 1.8.0rc0 on Linux:
-$ sudo pip install /tmp/tensorflow_pkg/tensorflow-1.7.0-py2-none-any.whl
+$ sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0rc0-py2-none-any.whl
 
## Validate your installation @@ -450,6 +450,8 @@ Stack Overflow and specify the `tensorflow` tag. **Linux** + + @@ -471,6 +473,7 @@ Stack Overflow and specify the `tensorflow` tag. **Mac**
Version: | CPU/GPU: | Python Version: | Compiler: | Build Tools: | cuDNN: | CUDA:
tensorflow-1.8.0 | CPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.10.0 | N/A | N/A
tensorflow_gpu-1.8.0 | GPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.9.0 | 7 | 9
tensorflow-1.7.0 | CPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.10.0 | N/A | N/A
tensorflow_gpu-1.7.0 | GPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.9.0 | 7 | 9
tensorflow-1.6.0 | CPU | 2.7, 3.3-3.6 | GCC 4.8 | Bazel 0.9.0 | N/A | N/A
+ @@ -486,6 +489,8 @@ Stack Overflow and specify the `tensorflow` tag. **Windows**
Version: | CPU/GPU: | Python Version: | Compiler: | Build Tools: | cuDNN: | CUDA:
tensorflow-1.8.0 | CPU | 2.7, 3.3-3.6 | Clang from xcode | Bazel 0.10.1 | N/A | N/A
tensorflow-1.7.0 | CPU | 2.7, 3.3-3.6 | Clang from xcode | Bazel 0.10.1 | N/A | N/A
tensorflow-1.6.0 | CPU | 2.7, 3.3-3.6 | Clang from xcode | Bazel 0.8.1 | N/A | N/A
tensorflow-1.5.0 | CPU | 2.7, 3.3-3.6 | Clang from xcode | Bazel 0.8.1 | N/A | N/A
+ + diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel index 11f476d12c0..0563bd4d6c5 100644 --- a/tensorflow/tools/docker/Dockerfile.devel +++ b/tensorflow/tools/docker/Dockerfile.devel @@ -70,7 +70,7 @@ RUN mkdir /bazel && \ # Download and build TensorFlow. WORKDIR /tensorflow -RUN git clone --branch=r1.7 --depth=1 https://github.com/tensorflow/tensorflow.git . +RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git . # TODO(craigcitro): Don't install the pip package, since it makes it # more difficult to experiment with local changes. Instead, just add diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl index 037d13116ef..c65e0b72bc5 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl +++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl @@ -3,7 +3,7 @@ FROM tensorflow/tensorflow:latest-devel LABEL maintainer="Clayne Robison" # These arguments are parameterized. Use --build-args to override. -ARG TF_BRANCH=r1.7 +ARG TF_BRANCH=r1.8 ARG WHL_DIR=/whl RUN apt-get update && apt-get install -y --no-install-recommends \ diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu index 1fcb6428b21..9f0cf63e7e2 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-gpu +++ b/tensorflow/tools/docker/Dockerfile.devel-gpu @@ -79,7 +79,7 @@ RUN mkdir /bazel && \ # Download and build TensorFlow. WORKDIR /tensorflow -RUN git clone --branch=r1.7 --depth=1 https://github.com/tensorflow/tensorflow.git . +RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git . # Configure the build for our CUDA configuration. ENV CI_BUILD_PYTHON python diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 6511a50b3bb..f676f040ad3 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -29,7 +29,7 @@ from setuptools.dist import Distribution # This version string is semver compatible, but incompatible with pip. # For pip, we will remove all '-' characters from this string, and use the # result for pip. -_VERSION = '1.7.0' +_VERSION = '1.8.0-rc0' REQUIRED_PACKAGES = [ 'absl-py >= 0.1.6', From 6f6f913bc2e9866d70e0615fcae22371d32eee86 Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Tue, 10 Apr 2018 11:19:26 -0700 Subject: [PATCH 0070/1734] Adding the python symlink command for devel packages too. --- tensorflow/tools/docker/Dockerfile.devel | 2 ++ tensorflow/tools/docker/Dockerfile.devel-gpu | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel index 0563bd4d6c5..f2415930d5e 100644 --- a/tensorflow/tools/docker/Dockerfile.devel +++ b/tensorflow/tools/docker/Dockerfile.devel @@ -38,6 +38,8 @@ RUN pip --no-cache-dir install \ && \ python -m ipykernel.kernelspec +# RUN ln -s /usr/bin/python3 /usr/bin/python# + # Set up our notebook config. COPY jupyter_notebook_config.py /root/.jupyter/ diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu index 9f0cf63e7e2..1d198219685 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-gpu +++ b/tensorflow/tools/docker/Dockerfile.devel-gpu @@ -47,6 +47,8 @@ RUN pip --no-cache-dir install \ && \ python -m ipykernel.kernelspec +# RUN ln -s /usr/bin/python3 /usr/bin/python# + # Set up our notebook config. 
COPY jupyter_notebook_config.py /root/.jupyter/ From fd75fb4b7740c1a1b82d2252f33c4b22f1f47e0f Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Tue, 10 Apr 2018 14:59:23 -0700 Subject: [PATCH 0071/1734] Forcing the symlink creation. --- tensorflow/tools/docker/Dockerfile | 2 +- tensorflow/tools/docker/Dockerfile.devel | 2 +- tensorflow/tools/docker/Dockerfile.devel-gpu | 2 +- tensorflow/tools/docker/Dockerfile.gpu | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/tools/docker/Dockerfile b/tensorflow/tools/docker/Dockerfile index 024cb40eb4b..78cb4d250e8 100644 --- a/tensorflow/tools/docker/Dockerfile +++ b/tensorflow/tools/docker/Dockerfile @@ -47,7 +47,7 @@ RUN pip --no-cache-dir install \ http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.0.0-cp27-none-linux_x86_64.whl # --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- # -# RUN ln -s /usr/bin/python3 /usr/bin/python# +# RUN ln -s -f /usr/bin/python3 /usr/bin/python# # Set up our notebook config. COPY jupyter_notebook_config.py /root/.jupyter/ diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel index f2415930d5e..390d7442c37 100644 --- a/tensorflow/tools/docker/Dockerfile.devel +++ b/tensorflow/tools/docker/Dockerfile.devel @@ -38,7 +38,7 @@ RUN pip --no-cache-dir install \ && \ python -m ipykernel.kernelspec -# RUN ln -s /usr/bin/python3 /usr/bin/python# +# RUN ln -s -f /usr/bin/python3 /usr/bin/python# # Set up our notebook config. COPY jupyter_notebook_config.py /root/.jupyter/ diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu index 1d198219685..293028d229a 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-gpu +++ b/tensorflow/tools/docker/Dockerfile.devel-gpu @@ -47,7 +47,7 @@ RUN pip --no-cache-dir install \ && \ python -m ipykernel.kernelspec -# RUN ln -s /usr/bin/python3 /usr/bin/python# +# RUN ln -s -f /usr/bin/python3 /usr/bin/python# # Set up our notebook config. COPY jupyter_notebook_config.py /root/.jupyter/ diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu index 625321e1235..9e1708662e7 100644 --- a/tensorflow/tools/docker/Dockerfile.gpu +++ b/tensorflow/tools/docker/Dockerfile.gpu @@ -54,7 +54,7 @@ RUN pip --no-cache-dir install \ http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.0.0-cp27-none-linux_x86_64.whl # --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- # -# RUN ln -s /usr/bin/python3 /usr/bin/python# +# RUN ln -s -f /usr/bin/python3 /usr/bin/python# # Set up our notebook config. COPY jupyter_notebook_config.py /root/.jupyter/ From 69342d7a6c61c4aa2ca42ac010ed0e66f0b89755 Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Tue, 10 Apr 2018 16:10:13 -0700 Subject: [PATCH 0072/1734] Updating the sed command for docker parameterized build. 
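Context: the previous commit changed the marker comments in the Dockerfiles to use `ln -s -f`, so the sed expressions in parameterized_docker_build.sh that turn the marker into an active RUN instruction for Python 3 builds have to match the new text exactly; a pattern that no longer matches would silently leave the symlink line commented out. A minimal Python sketch of the substitution the sed line performs (illustration only, not part of the script):

```python
import re

# The marker line is kept as an inert comment in the Dockerfile; the build
# script uncomments it (and drops the trailing '#') for Python 3 images.
# The pattern must mirror the 'ln -s -f' form introduced one commit earlier.
line = "# RUN ln -s -f /usr/bin/python3 /usr/bin/python#"
activated = re.sub(
    r"^# (RUN ln -s -f /usr/bin/python3 /usr/bin/python)#$", r"\1", line)
print(activated)  # RUN ln -s -f /usr/bin/python3 /usr/bin/python
```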
--- tensorflow/tools/docker/parameterized_docker_build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh index b4fba5b8f5e..05de25f2cb1 100755 --- a/tensorflow/tools/docker/parameterized_docker_build.sh +++ b/tensorflow/tools/docker/parameterized_docker_build.sh @@ -284,7 +284,7 @@ if [[ "${TF_DOCKER_BUILD_IS_DEVEL}" == "no" ]]; then if sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \ sed -i -e 's/python-dev/python3-dev/g' "${DOCKERFILE}" && \ sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \ - sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}" + sed -i -e 's^# RUN ln -s -f /usr/bin/python3 /usr/bin/python#^RUN ln -s -f /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}" then echo "Modified Dockerfile for python version "\ "${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}" @@ -306,7 +306,7 @@ else sed -i -e 's^/tmp/pip^/tmp/pip3^g' "${DOCKERFILE}" && \ sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \ sed -i -e 's/ENV CI_BUILD_PYTHON python/ENV CI_BUILD_PYTHON python3/g' "${DOCKERFILE}" && \ - sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}" + sed -i -e 's^# RUN ln -s -f /usr/bin/python3 /usr/bin/python#^RUN ln -s -f /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}" then echo "Modified Dockerfile further for python version ${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}" else From ef6637771b2582245bb15507a6796b3c3f1db6b5 Mon Sep 17 00:00:00 2001 From: ManHyuk Date: Wed, 11 Apr 2018 20:48:32 +0900 Subject: [PATCH 0073/1734] fix typo --- tensorflow/core/framework/collective.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h index 5810c7fa547..a82fb50d880 100644 --- a/tensorflow/core/framework/collective.h +++ b/tensorflow/core/framework/collective.h @@ -178,7 +178,7 @@ class StepSequenceInterface { virtual void RefreshStepIdSequenceAsync(int64 graph_key, const StatusCallback& done) = 0; - // Returns the the step_id that should be used for initiating a new execution + // Returns the step_id that should be used for initiating a new execution // on the specified graph. May return the same step_id multiple times if // RetireStepId or RefreshStepIdReservation is not called. virtual int64 NextStepId(int64 graph_key) = 0; From acd9725e72af749c60153cd4d7efdd679c935426 Mon Sep 17 00:00:00 2001 From: ManHyuk Date: Wed, 11 Apr 2018 20:49:46 +0900 Subject: [PATCH 0074/1734] fix typo --- tensorflow/contrib/lite/toco/model.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h index 56ef9fe2a88..8a936842d90 100644 --- a/tensorflow/contrib/lite/toco/model.h +++ b/tensorflow/contrib/lite/toco/model.h @@ -151,7 +151,7 @@ enum class AxesOrder { }; // The type of the scalars in an array. -// Note that that does not by itself tell whether the values in the array are +// Note that does not by itself tell whether the values in the array are // real (are literally interpreted as real numbers) or quantized (only acquire // a meaning as real numbers in conjunction with QuantizationParams). 
// From 44fc1feaa989ea4e1fbfe49dc9ca4db3ce661659 Mon Sep 17 00:00:00 2001 From: Anna R Date: Wed, 11 Apr 2018 12:27:55 -0700 Subject: [PATCH 0075/1734] Relaxing float comparison and removing unneeded include --- tensorflow/contrib/layers/python/layers/rev_block_lib_test.py | 4 ++-- tensorflow/stream_executor/cuda/cudnn_version_test.cc | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py index 392a490be15..8c118402a4c 100644 --- a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py +++ b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py @@ -60,8 +60,8 @@ class RevBlockTest(test.TestCase): sess.run(variables.global_variables_initializer()) x1, x2, x1_inv, x2_inv = sess.run([x1, x2, x1_inv, x2_inv]) - self.assertAllClose(x1, x1_inv) - self.assertAllClose(x2, x2_inv) + self.assertAllClose(x1, x1_inv, atol=1e-5) + self.assertAllClose(x2, x2_inv, atol=1e-5) def testBackwardForward(self): diff --git a/tensorflow/stream_executor/cuda/cudnn_version_test.cc b/tensorflow/stream_executor/cuda/cudnn_version_test.cc index 230adafeb11..42b3dc8cc67 100644 --- a/tensorflow/stream_executor/cuda/cudnn_version_test.cc +++ b/tensorflow/stream_executor/cuda/cudnn_version_test.cc @@ -15,7 +15,6 @@ limitations under the License. #include "tensorflow/stream_executor/cuda/cudnn_version.h" -#include "testing/base/public/gunit.h" #include "tensorflow/core/platform/test.h" namespace perftools { From 242788aa28a838fe0e611780023d74be04606e1d Mon Sep 17 00:00:00 2001 From: Asim Shankar Date: Tue, 10 Apr 2018 19:20:58 -0700 Subject: [PATCH 0076/1734] experimental C API: Fix compilation failure in Windows. The functions added in https://github.com/tensorflow/tensorflow/commit/be917027e37c5e8f21f6ba07f24bdbf072cf6dfd are temporary, and their existence breaks compilation in MSVC because of https://docs.microsoft.com/en-us/cpp/c-language/maximum-string-length and https://docs.microsoft.com/en-us/cpp/error-messages/compiler-errors-1/compiler-error-c2026 So just disabling it in Windows for now. PiperOrigin-RevId: 192391164 --- tensorflow/c/BUILD | 1 + tensorflow/c/c_api_experimental.cc | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 2367014cd02..8a9301d5847 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -122,6 +122,7 @@ tf_cuda_library( "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core:lib_platform", "//tensorflow/core:protos_all_cc", ], ) diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index e82a5460920..9678ee926fc 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/platform.h" #include "tensorflow/core/protobuf/config.pb.h" using tensorflow::FunctionDef; @@ -189,6 +190,12 @@ library { // be deleted by calling TF_DeleteFunction. 
static std::vector CreateImagenetDatasetFunctions( const char* file_path, std::string* dataset_name, TF_Status* status) { +#if defined(PLATFORM_WINDOWS) + status->status = tensorflow::errors::Unimplemented( + "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API " + "is not implemented for Windows"); + return std::vector(); +#else const char* func_def = R"PREFIX( library { function { @@ -7067,6 +7074,7 @@ library { DCHECK(found); }; return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status); +#endif } // On success, returns a set of TF_Function instances encoding a dataset @@ -7076,6 +7084,12 @@ library { static std::vector CreateMNISTDatasetFunctions( const char* file_path, int batch_size, std::string* dataset_name, TF_Status* status) { +#if defined(PLATFORM_WINDOWS) + status->status = tensorflow::errors::Unimplemented( + "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API " + "is not implemented for Windows"); + return nullptr; +#else const char* func_def = R"PREFIX( library { function { @@ -8205,6 +8219,7 @@ library { DCHECK(found_batch_size); }; return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status); +#endif } // Adds the input functions to `graph`. On success, returns the created From 9d1aa895adda8644ddbb55b5e1dbb0797ea6cbb0 Mon Sep 17 00:00:00 2001 From: Jie Date: Wed, 11 Apr 2018 14:42:15 -0700 Subject: [PATCH 0077/1734] [tftrt update] Added support for TRT plugin during conversion - converter & shape inference are now aware of plugin factory. - each plugin does serialization of plugin type & input dimensions - wrapper for nvinfer1::IPlugin & nvinfer1::PluginFactory * compatible with TRT 3.0.4 plugin API. * future plugin API changes willl be updated. --- tensorflow/contrib/tensorrt/BUILD | 26 ++++++ .../contrib/tensorrt/convert/convert_graph.cc | 4 +- .../contrib/tensorrt/convert/convert_nodes.cc | 84 ++++++++++++++--- .../contrib/tensorrt/kernels/trt_engine_op.cc | 4 +- .../contrib/tensorrt/plugin/trt_plugin.cc | 89 +++++++++++++++++++ .../contrib/tensorrt/plugin/trt_plugin.h | 81 +++++++++++++++++ .../tensorrt/plugin/trt_plugin_factory.cc | 81 +++++++++++++++++ .../tensorrt/plugin/trt_plugin_factory.h | 83 +++++++++++++++++ .../tensorrt/plugin/trt_plugin_utils.cc | 36 ++++++++ .../tensorrt/plugin/trt_plugin_utils.h | 51 +++++++++++ .../contrib/tensorrt/shape_fn/trt_shfn.cc | 4 +- 11 files changed, 528 insertions(+), 15 deletions(-) create mode 100644 tensorflow/contrib/tensorrt/plugin/trt_plugin.cc create mode 100644 tensorflow/contrib/tensorrt/plugin/trt_plugin.h create mode 100644 tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc create mode 100644 tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h create mode 100644 tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc create mode 100644 tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD index 2f316767b35..98f18835b06 100644 --- a/tensorflow/contrib/tensorrt/BUILD +++ b/tensorflow/contrib/tensorrt/BUILD @@ -67,6 +67,7 @@ tf_cuda_library( visibility = ["//visibility:public"], deps = [ ":trt_logging", + ":trt_plugins", ] + if_tensorrt([ "@local_config_tensorrt//:nv_infer", ]) + tf_custom_op_library_additional_deps(), @@ -86,6 +87,7 @@ cc_library( visibility = ["//visibility:public"], deps = [ ":trt_logging", + ":trt_plugins", ":trt_resources", "//tensorflow/core:gpu_headers_lib", "//tensorflow/core:lib_proto_parsing", @@ -222,6 +224,7 @@ tf_cuda_library( ], deps = 
[ ":segment", + ":trt_plugins", ":trt_logging", ":trt_resources", "//tensorflow/core/grappler:grappler_item", @@ -272,3 +275,26 @@ tf_cc_test( "//tensorflow/core:test_main", ], ) + +# Library for the plugin factory +#cc_library( +tf_cuda_library( + name = "trt_plugins", + srcs = [ + "plugin/trt_plugin.cc", + "plugin/trt_plugin_factory.cc", + "plugin/trt_plugin_utils.cc", + ], + hdrs = [ + "plugin/trt_plugin.h", + "plugin/trt_plugin_factory.h", + "plugin/trt_plugin_utils.h", + ], + linkstatic = 1, + deps = [ + #"@protobuf_archive//:protobuf_headers", + ] + if_tensorrt([ + "@local_config_tensorrt//:nv_infer", + ]), +) + diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc index b412b296e02..899e1721e6e 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/contrib/tensorrt/convert/convert_graph.h" +#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h" #include #include @@ -75,7 +76,8 @@ bool IsTensorRTCandidate(const tensorflow::Node* node) { // TODO(ben,jie): ... }; // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.h) - return candidate_ops.count(node->type_string()); + return (candidate_ops.count(node->type_string()) || + PluginFactoryTensorRT::GetInstance().IsPlugin(&node->type_string())); } void GetSubGraphIncomingEdges(const tensorflow::Graph& graph, diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc index 567b4af88df..a03c1e224ac 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/contrib/tensorrt/convert/convert_nodes.h" +#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h" #include #include @@ -246,6 +247,15 @@ class TFAttrs { return attrs_.count(key) ? this->get(key) : default_value; } + std::vector GetAllAttrKey() { + std::vector attr_list; + for (AttrMap::iterator iter = attrs_.begin(); iter != attrs_.end(); + iter++) { + attr_list.emplace_back(iter->first); + } + return attr_list; + } + private: typedef std::map AttrMap; AttrMap attrs_; @@ -262,6 +272,12 @@ std::vector TFAttrs::get>(string key) const { return std::vector(attr.begin(), attr.end()); } +template <> +std::vector TFAttrs::get>(string key) const { + auto attr = this->at(key)->list().f(); + return std::vector(attr.begin(), attr.end()); +} + template <> std::vector TFAttrs::get>(string key) const { auto attr = this->at(key)->list().s(); @@ -424,6 +440,7 @@ using OpConverter = class Converter { std::unordered_map trt_tensors_; std::unordered_map op_registry_; + OpConverter plugin_converter_; nvinfer1::INetworkDefinition* trt_network_; std::list> temp_bufs_; tensorflow::tensorrt::TRTWeightStore* weight_store_; @@ -444,8 +461,8 @@ class Converter { * remove this and annotate the edge as a control dependency. 
************************************************************************/ // skip control nodes - if (input_name[0] == '^' ) continue; - string name = input_name; + if (input_name[0] == '^') continue; + string name = input_name; auto first = name.find_first_of(':'); if (first != string::npos && first + 2 == name.size() && name[first + 1] == '0') @@ -490,13 +507,17 @@ class Converter { std::vector inputs; TF_RETURN_IF_ERROR(this->get_inputs(node_def, &inputs)); string op = node_def.op(); - if (!op_registry_.count(op)) { - return tensorflow::errors::Unimplemented( - "No converter registered for op: " + op); - } - OpConverter op_converter = op_registry_.at(op); std::vector outputs; - TF_RETURN_IF_ERROR(op_converter(*this, node_def, inputs, &outputs)); + if (PluginFactoryTensorRT::GetInstance().IsPlugin(&op)) { + TF_RETURN_IF_ERROR(plugin_converter_(*this, node_def, inputs, &outputs)); + } else { + if (!op_registry_.count(op)) { + return tensorflow::errors::Unimplemented( + "No converter registered for op: " + op); + } + OpConverter op_converter = op_registry_.at(op); + TF_RETURN_IF_ERROR(op_converter(*this, node_def, inputs, &outputs)); + } for (size_t i = 0; i < outputs.size(); ++i) { TRT_TensorOrWeights output = outputs.at(i); // TODO(jie): tf protobuf seems to be omitting the :0 suffix @@ -1158,9 +1179,9 @@ tensorflow::Status BinaryTensorOpTensor( CHECK_EQ_TYPE(tensor_r->getType(), dtype); auto op_pair = ops.find(node_def.op()); if (op_pair == ops.end()) - return tensorflow::errors::Unimplemented( - "binary op: " + node_def.op() + - " not supported at: " + node_def.name()); + return tensorflow::errors::Unimplemented("binary op: " + node_def.op() + + " not supported at: " + + node_def.name()); nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise( *const_cast(tensor_l), @@ -1173,6 +1194,43 @@ tensorflow::Status BinaryTensorOpTensor( return tensorflow::Status::OK(); } +tensorflow::Status ConvertPlugin(Converter& ctx, + const tensorflow::NodeDef& node_def, + const std::vector& inputs, + std::vector* outputs) { + // prepare input + std::vector all_inputs; + for (auto input : inputs) { + all_inputs.emplace_back(const_cast(input.tensor())); + } + + // plugin is owned by PluginFactory + // TODO(jie): destroy plugins later (resource management) + PluginTensorRT* plugin = + PluginFactoryTensorRT::GetInstance().CreatePlugin(&node_def.op()); + + // passing attributes + // TODO(jie): support more general attribute + TFAttrs attrs(node_def); + auto attr_key_vector = attrs.GetAllAttrKey(); + for (auto attr_key : attr_key_vector) { + std::cout << attr_key << std::endl; + // TODO(jie): support only list of float for toy example here. 
+ auto data = attrs.get>(attr_key); + size_t size_data = data.size() * sizeof(float); + plugin->SetAttribute(attr_key, static_cast(data.data()), size_data); + } + + nvinfer1::IPluginLayer* layer = + ctx.network()->addPlugin(&all_inputs[0], int(inputs.size()), *plugin); + + for (int i = 0; i < layer->getNbOutputs(); i++) { + nvinfer1::ITensor* output_tensor = layer->getOutput(i); + outputs->push_back(TRT_TensorOrWeights(output_tensor)); + } + return tensorflow::Status::OK(); +} + tensorflow::Status ConvertPlaceholder( Converter& ctx, const tensorflow::NodeDef& node_def, const std::vector& inputs, @@ -2073,6 +2131,8 @@ void Converter::register_op_converters() { op_registry_["Reshape"] = ConvertReshape; op_registry_["FusedBatchNorm"] = ConvertFusedBatchNorm; op_registry_["FusedBatchNormV2"] = ConvertFusedBatchNorm; + + plugin_converter_ = ConvertPlugin; } } // namespace @@ -2511,7 +2571,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef( std::vector input_names; std::vector input_dtypes; for (const std::pair& input : s.input_inds) { - VLOG(2) << "parsing input. Node id= " << input.first ; + VLOG(2) << "parsing input. Node id= " << input.first; int node_id = input.first; int output_idx = input.second; tensorflow::Node* node = s.graph.FindNodeId(node_id); diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc index b32371b642f..8881c48fe68 100644 --- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h" +#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h" #include "tensorflow/contrib/tensorrt/log/trt_logger.h" #include "tensorflow/core/platform/logging.h" @@ -58,7 +59,8 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) { IRuntime* infer = nvinfer1::createInferRuntime(logger); trt_engine_ptr_.reset(infer->deserializeCudaEngine( - serialized_engine.c_str(), serialized_engine.size(), nullptr)); + serialized_engine.c_str(), serialized_engine.size(), + &PluginFactoryTensorRT::GetInstance())); trt_execution_context_ptr_.reset(trt_engine_ptr_->createExecutionContext()); // Runtime is safe to delete after engine creation infer->destroy(); diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin.cc b/tensorflow/contrib/tensorrt/plugin/trt_plugin.cc new file mode 100644 index 00000000000..0e4a157d790 --- /dev/null +++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin.cc @@ -0,0 +1,89 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h" +#include +#include +#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h" + +#if GOOGLE_CUDA +#if GOOGLE_TENSORRT + +namespace tensorflow { +namespace tensorrt { + +PluginTensorRT::PluginTensorRT(const void* serialized_data, size_t length) { + // sanity check. + assert(EncodeOpName(GetPluginName()) != + *static_cast(serialized_data)); + const char* buffer = static_cast(serialized_data) + + sizeof(input_dim_list_.size()); + + size_t count = *reinterpret_cast(buffer); + buffer += sizeof(size_t); + + for (int i = 0; i < count; i++) { + nvinfer1::Dims dim; + std::memcpy(&(dim.nbDims), buffer, sizeof(dim.nbDims)); + buffer += sizeof(dim.nbDims); + std::memcpy(dim.d, buffer, sizeof(dim.d)); + buffer += sizeof(dim.d); + std::memcpy(dim.type, buffer, sizeof(dim.type)); + buffer += sizeof(dim.type); + input_dim_list_.emplace_back(dim); + } +} + +size_t PluginTensorRT::getSerializationSize() { + nvinfer1::Dims dim; + return sizeof(size_t) + sizeof(input_dim_list_.size()) + sizeof(dim.nbDims) + + sizeof(dim.d) + sizeof(dim.type); +} + +void PluginTensorRT::serialize(void* serialized_data) { + size_t encode_op_name = EncodeOpName(GetPluginName()); + char* buffer = static_cast(serialized_data); + std::memcpy(buffer, &encode_op_name, sizeof(size_t)); + buffer += sizeof(size_t); + + auto list_size = input_dim_list_.size(); + std::memcpy(buffer, &list_size, sizeof(input_dim_list_.size())); + buffer += sizeof(input_dim_list_.size()); + + for (int i = 0; i < input_dim_list_.size(); i++) { + auto dim = input_dim_list_[i]; + std::memcpy(buffer, &(dim.nbDims), sizeof(dim.nbDims)); + buffer += sizeof(dim.nbDims); + std::memcpy(buffer, dim.d, sizeof(dim.d)); + buffer += sizeof(dim.d); + std::memcpy(buffer, dim.type, sizeof(dim.type)); + buffer += sizeof(dim.type); + } +} + +bool PluginTensorRT::StoreAttribute(const string& key, const void* ptr, + const size_t size) { + if (attr_map_.count(key) != 0) return false; + + attr_map_.emplace(key, std::vector(size)); + std::memcpy(attr_map_[key].data(), ptr, size); + return true; +} + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA +#endif // GOOGLE_TENSORRT diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin.h b/tensorflow/contrib/tensorrt/plugin/trt_plugin.h new file mode 100644 index 00000000000..1bbfe62a4e6 --- /dev/null +++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin.h @@ -0,0 +1,81 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN +#define TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN + +#include +#include +#include +#include + +#if GOOGLE_CUDA +#if GOOGLE_TENSORRT +#include "tensorrt/include/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { + +using std::string; +using std::unordered_map; + +class PluginTensorRT : public nvinfer1::IPlugin { + public: + PluginTensorRT(){}; + PluginTensorRT(const void* serialized_data, size_t length); + // PluginTensorRT(const void* serialized_data, size_t length, size_t + // &incremental); + virtual string GetPluginName() = 0; + virtual bool Finalize() = 0; + + virtual bool SetAttribute(const string& key, const void* ptr, + const size_t size) = 0; + virtual bool GetAttribute(const string& key, const void* ptr, + size_t& size) = 0; + + void configure(const nvinfer1::Dims* inputs, int nbInputs, + const nvinfer1::Dims* outputs, int nbOutputs, + int maxBatchSize) override { + for (int index = 0; index < nbInputs; index++) { + nvinfer1::Dims dim; + dim.nbDims = inputs[index].nbDims; + for (int i = 0; i < dim.nbDims; i++) { + dim.d[i] = inputs[index].d[i]; + dim.type[i] = inputs[index].type[i]; + } + input_dim_list_.emplace_back(dim); + } + return; + } + + virtual bool StoreAttribute(const string& key, const void* ptr, + const size_t size); + + virtual size_t getSerializationSize() override; + virtual void serialize(void* buffer) override; + + protected: + std::unordered_map > attr_map_; + + std::vector input_dim_list_; +}; + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA + +#endif // TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc new file mode 100644 index 00000000000..799c609a3eb --- /dev/null +++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc @@ -0,0 +1,81 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h" + +#if GOOGLE_CUDA +#if GOOGLE_TENSORRT + +namespace tensorflow { +namespace tensorrt { + +PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layerName, + const void* serial_data, + size_t serial_length) { + size_t parsed_byte = 0; + // extract op_name from serial_data + size_t encoded_op_name = + ExtractOpName(serial_data, serial_length, parsed_byte); + + if (!IsPlugin(encoded_op_name)) { + return nullptr; + } + + // should I lock plugins here? 
+ instance_m_.lock(); + auto plugin_ptr = + plugin_registry_[encoded_op_name].first(serial_data, serial_length); + // string op_name = "IncPluginTRT"; + // auto plugin_ptr = plugin_registry_[EncodeLayerName(&op_name)].second(); + // auto plugin_ptr = plugin_registry_.begin()->second.second(); + owned_plugins_.emplace_back(plugin_ptr); + instance_m_.unlock(); + + return plugin_ptr; +} + +PluginTensorRT* PluginFactoryTensorRT::CreatePlugin(const string* op_name) { + if (!IsPlugin(op_name)) return nullptr; + + instance_m_.lock(); + auto plugin_ptr = plugin_registry_[EncodeLayerName(op_name)].second(); + owned_plugins_.emplace_back(plugin_ptr); + instance_m_.unlock(); + + return plugin_ptr; +} + +bool PluginFactoryTensorRT::RegisterPlugin( + const string* op_name, PluginDeserializeFunc deserialize_func, + PluginConstructFunc construct_func) { + if (IsPlugin(op_name)) return false; + + // get instance_m_ first before write to registry; + instance_m_.lock(); + auto ret = plugin_registry_.emplace( + EncodeLayerName(op_name), + std::make_pair(deserialize_func, construct_func)); + instance_m_.unlock(); + + return ret.second; +} + +void PluginFactoryTensorRT::DestroyPlugins() { return; } + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA +#endif // GOOGLE_TENSORRT diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h new file mode 100644 index 00000000000..e68f4629d0c --- /dev/null +++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h @@ -0,0 +1,83 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY +#define TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY + +#include +#include +#include +#include "trt_plugin.h" +#include "trt_plugin_utils.h" + +#if GOOGLE_CUDA +#if GOOGLE_TENSORRT +#include "tensorrt/include/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { + +class PluginFactoryTensorRT : public nvinfer1::IPluginFactory { + public: + // deserialization method + // virtual nvinfer1::IPlugin* createPlugin(const char* layerName, const void* + // serialData, size_t serialLength) override; + PluginTensorRT* createPlugin(const char* layerName, const void* serialData, + size_t serialLength) override; + + // construction + PluginTensorRT* CreatePlugin(const string* op_name); + + static PluginFactoryTensorRT& GetInstance() { + static PluginFactoryTensorRT factory_instance; + return factory_instance; + } + + bool RegisterPlugin(const string* op_name, + PluginDeserializeFunc deserialize_func, + PluginConstructFunc construct_func); + + bool IsPlugin(const size_t encode_name) { + return plugin_registry_.find(encode_name) != plugin_registry_.end(); + } + + bool IsPlugin(const string* op_name) { + return IsPlugin(EncodeLayerName(op_name)); + } + + size_t EncodeLayerName(const string* op_name) { + return EncodeOpName(*op_name); + } + + void DestroyPlugins(); + + protected: + std::unordered_map > + plugin_registry_; + + // TODO(jie): Owned plugin should be associated with different sessions; + // should really hand ownership of plugins to resource management; + std::vector > owned_plugins_; + std::mutex instance_m_; +}; + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA + +#endif // TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc b/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc new file mode 100644 index 00000000000..b14480cfa67 --- /dev/null +++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc @@ -0,0 +1,36 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h" + +#if GOOGLE_CUDA +#if GOOGLE_TENSORRT + +namespace tensorflow { +namespace tensorrt { + +size_t ExtractOpName(const void* serial_data, size_t serial_length, + size_t& incremental) { + incremental = sizeof(size_t); + if (serial_length < incremental) return 0; + size_t encoded_op_name = *static_cast(serial_data); + return encoded_op_name; +} + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA +#endif // GOOGLE_TENSORRT diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h b/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h new file mode 100644 index 00000000000..e9675d84cd3 --- /dev/null +++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h @@ -0,0 +1,51 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_UTILS +#define TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_UTILS + +#include +#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h" + +#if GOOGLE_CUDA +#if GOOGLE_TENSORRT +#include "tensorrt/include/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { + +typedef std::function + PluginDeserializeFunc; + +typedef std::function PluginConstructFunc; + +inline size_t EncodeOpName(std::string str) { + return std::hash{}(str); +} + +// TODO(jie): work on error handling here +size_t ExtractOpName(const void* serial_data, size_t serial_length, + size_t& incremental); + +// size_t Deserialize(const char* serial_data, size_t serial_length, size_t +// &incremental); + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA + +#endif // TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_UTILS diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc index 8b475177bc6..30b5616475e 100644 --- a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc +++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc @@ -14,6 +14,7 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h" +#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h" #include #include @@ -33,7 +34,8 @@ tensorflow::Status TRTEngineOpShapeInference(InferenceContext* context) { TF_RETURN_IF_ERROR(context->GetAttr("serialized_engine", &serialized_engine)); nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(logger); nvinfer1::ICudaEngine* trt_engine = infer->deserializeCudaEngine( - serialized_engine.c_str(), serialized_engine.size(), nullptr); + serialized_engine.c_str(), serialized_engine.size(), + &tensorrt::PluginFactoryTensorRT::GetInstance()); int num_batch = -1; std::vector<::tensorflow::DataType> input_type; From 0cc518ee98d4caa154f8a7530cb971c00c610905 Mon Sep 17 00:00:00 2001 From: Michael Case Date: Wed, 11 Apr 2018 09:34:44 -0700 Subject: [PATCH 0078/1734] Fix Windows GPU TensorFlow Bazel builds. The configure.py script will error out on Windows GPU builds due to NCCL attempted to be configured (and is currently Linux only). PiperOrigin-RevId: 192461362 --- configure.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/configure.py b/configure.py index 81d5ad77ee4..8fb89791116 100644 --- a/configure.py +++ b/configure.py @@ -1516,7 +1516,8 @@ def main(): set_tf_cudnn_version(environ_cp) if is_linux(): set_tf_tensorrt_install_path(environ_cp) - set_tf_nccl_install_path(environ_cp) + set_tf_nccl_install_path(environ_cp) + set_tf_cuda_compute_capabilities(environ_cp) if 'LD_LIBRARY_PATH' in environ_cp and environ_cp.get( 'LD_LIBRARY_PATH') != '1': From 88fcde66561a8c7a869a4dc57003a30376c4b548 Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Wed, 11 Apr 2018 16:23:10 -0700 Subject: [PATCH 0079/1734] Remove reference cycle checks from unit tests which touch uuid.uuid4() Should fix the release builds. They're failing because uuid4() creates reference cycles in Python 2.7.9 (2.7.11+ are fine). 
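For reference, a minimal way to observe the cycles (illustration only, assuming CPython's gc module; not part of this change):

```python
import gc
import uuid

# Count the cyclic garbage left behind by a single uuid4() call. Plain
# refcounting frees acyclic objects immediately, so anything gc.collect()
# reports here was held in a reference cycle, which is exactly what trips
# the assert_no_eager_garbage checks on Python 2.7.9.
gc.disable()
gc.collect()            # start from a clean slate
uuid.uuid4()            # result intentionally discarded
cyclic = gc.collect()   # number of unreachable (cyclic) objects found
gc.enable()
print("objects freed from reference cycles: %d" % cyclic)
```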
--- .../contrib/eager/python/checkpointable_utils_test.py | 8 ++++---- .../contrib/optimizer_v2/checkpointable_utils_test.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorflow/contrib/eager/python/checkpointable_utils_test.py b/tensorflow/contrib/eager/python/checkpointable_utils_test.py index e6498ddb064..1dd0f21a077 100644 --- a/tensorflow/contrib/eager/python/checkpointable_utils_test.py +++ b/tensorflow/contrib/eager/python/checkpointable_utils_test.py @@ -116,7 +116,7 @@ class OnlyOneDep(checkpointable.Checkpointable): class SplitTests(test.TestCase): - @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True) + @test_util.run_in_graph_and_eager_modes() def testSaveRestoreSplitDep(self): save_checkpoint = checkpointable_utils.Checkpoint( dep=SaveTensorSlicesAsDeps()) @@ -390,7 +390,7 @@ class CheckpointingTests(test.TestCase): optimizer_node.slot_variables[0] .slot_variable_node_id].attributes[0].checkpoint_key) - @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True) + @test_util.run_in_graph_and_eager_modes() def testMoreComplexSaveableReturned(self): v = _OwnsMirroredVariables() checkpoint = checkpointable_utils.Checkpoint(v=v) @@ -976,7 +976,7 @@ class CheckpointingTests(test.TestCase): saver.save(checkpoint_prefix) self.assertEqual(before_ops, graph.get_operations()) - @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True) + @test_util.run_in_graph_and_eager_modes() def testCheckpointCleanup(self): checkpoint_directory = self.get_temp_dir() checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") @@ -996,7 +996,7 @@ class CheckpointingTests(test.TestCase): expected_filenames, os.listdir(checkpoint_directory)) - @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True) + @test_util.run_in_graph_and_eager_modes() def testCheckpointCleanupChangingVarList(self): checkpoint_directory = self.get_temp_dir() checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py index 08f9699e850..d219795aa1e 100644 --- a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py +++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py @@ -411,7 +411,7 @@ class CheckpointingTests(test.TestCase): optimizer.apply_gradients( [(g, v) for g, v in zip(grad, model.vars)]) - @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True) + @test_util.run_in_graph_and_eager_modes() def testDeferredSlotRestoration(self): checkpoint_directory = self.get_temp_dir() From e5e530f91aae3e8cd08a77487bb00d0630413e8a Mon Sep 17 00:00:00 2001 From: Anna R Date: Wed, 11 Apr 2018 17:51:26 -0700 Subject: [PATCH 0080/1734] Exclude cudnn_version_test from build in tf_stream_executor.cmake --- tensorflow/contrib/cmake/tf_stream_executor.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/contrib/cmake/tf_stream_executor.cmake b/tensorflow/contrib/cmake/tf_stream_executor.cmake index 91ca33f4c4d..2b32b22a719 100644 --- a/tensorflow/contrib/cmake/tf_stream_executor.cmake +++ b/tensorflow/contrib/cmake/tf_stream_executor.cmake @@ -65,6 +65,10 @@ if (tensorflow_ENABLE_GPU) file(GLOB tf_stream_executor_gpu_srcs "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*.cc" ) + file(GLOB tf_stream_executor_gpu_tests + "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*_test.cc" + ) + list(REMOVE_ITEM tf_stream_executor_gpu_srcs ${tf_stream_executor_gpu_tests}) 
list(APPEND tf_stream_executor_srcs ${tf_stream_executor_gpu_srcs}) endif() From a75a5e48a4f9240a02a45119e77b28363e772bef Mon Sep 17 00:00:00 2001 From: Jonathan Hseu Date: Wed, 11 Apr 2018 17:54:10 -0700 Subject: [PATCH 0081/1734] Improve comment --- tensorflow/contrib/lite/toco/model.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h index 8a936842d90..d0ae8d389fd 100644 --- a/tensorflow/contrib/lite/toco/model.h +++ b/tensorflow/contrib/lite/toco/model.h @@ -151,9 +151,9 @@ enum class AxesOrder { }; // The type of the scalars in an array. -// Note that does not by itself tell whether the values in the array are -// real (are literally interpreted as real numbers) or quantized (only acquire -// a meaning as real numbers in conjunction with QuantizationParams). +// Note that the type does not by itself tell whether the values in the array +// are real (are literally interpreted as real numbers) or quantized (only +// acquire a meaning as real numbers in conjunction with QuantizationParams). // // In practice though: // float values are always real From 94768f9a886f85d2e147983907afffa57bc998ff Mon Sep 17 00:00:00 2001 From: Anna R Date: Wed, 11 Apr 2018 17:57:18 -0700 Subject: [PATCH 0082/1734] Exclude tests from tf_stream_executor build only if BUILD_CC_TESTS is OFF --- tensorflow/contrib/cmake/tf_stream_executor.cmake | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tensorflow/contrib/cmake/tf_stream_executor.cmake b/tensorflow/contrib/cmake/tf_stream_executor.cmake index 2b32b22a719..eaae64e1c64 100644 --- a/tensorflow/contrib/cmake/tf_stream_executor.cmake +++ b/tensorflow/contrib/cmake/tf_stream_executor.cmake @@ -65,10 +65,12 @@ if (tensorflow_ENABLE_GPU) file(GLOB tf_stream_executor_gpu_srcs "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*.cc" ) - file(GLOB tf_stream_executor_gpu_tests - "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*_test.cc" - ) - list(REMOVE_ITEM tf_stream_executor_gpu_srcs ${tf_stream_executor_gpu_tests}) + if (NOT tensorflow_BUILD_CC_TESTS) + file(GLOB tf_stream_executor_gpu_tests + "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*_test.cc" + } + list(REMOVE_ITEM tf_stream_executor_gpu_srcs ${tf_stream_executor_gpu_tests}) + endif() list(APPEND tf_stream_executor_srcs ${tf_stream_executor_gpu_srcs}) endif() From ffebc37eff2e44bbffa2964deeebb7fdaef2e219 Mon Sep 17 00:00:00 2001 From: Anna R Date: Wed, 11 Apr 2018 19:53:21 -0700 Subject: [PATCH 0083/1734] Build fixes --- tensorflow/c/c_api_experimental.cc | 2 +- tensorflow/contrib/cmake/tf_stream_executor.cmake | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index 9678ee926fc..a1107709214 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -7088,7 +7088,7 @@ static std::vector CreateMNISTDatasetFunctions( status->status = tensorflow::errors::Unimplemented( "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API " "is not implemented for Windows"); - return nullptr; + return std::vector(); #else const char* func_def = R"PREFIX( library { diff --git a/tensorflow/contrib/cmake/tf_stream_executor.cmake b/tensorflow/contrib/cmake/tf_stream_executor.cmake index eaae64e1c64..af48ef1fd40 100644 --- a/tensorflow/contrib/cmake/tf_stream_executor.cmake +++ b/tensorflow/contrib/cmake/tf_stream_executor.cmake @@ -68,7 +68,7 
@@ if (tensorflow_ENABLE_GPU) if (NOT tensorflow_BUILD_CC_TESTS) file(GLOB tf_stream_executor_gpu_tests "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*_test.cc" - } + ) list(REMOVE_ITEM tf_stream_executor_gpu_srcs ${tf_stream_executor_gpu_tests}) endif() list(APPEND tf_stream_executor_srcs ${tf_stream_executor_gpu_srcs}) From 89987f232fd9ff3e6cdab43bc7056f55cb4adf8c Mon Sep 17 00:00:00 2001 From: Anna R Date: Wed, 11 Apr 2018 20:15:18 -0700 Subject: [PATCH 0084/1734] Added a TODO to cover CreateMNISTDatasetFunctions in Windows tests --- tensorflow/c/c_api_experimental.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index a1107709214..4883e616423 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -7085,6 +7085,7 @@ static std::vector CreateMNISTDatasetFunctions( const char* file_path, int batch_size, std::string* dataset_name, TF_Status* status) { #if defined(PLATFORM_WINDOWS) + // TODO(ashankar): cover CreateMNISTDatasetFunctions in Windows tests. status->status = tensorflow::errors::Unimplemented( "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API " "is not implemented for Windows"); From f49a5f2aa35a16eab4625fdc4b2a0acef3933e34 Mon Sep 17 00:00:00 2001 From: Anna R Date: Wed, 11 Apr 2018 21:42:48 -0700 Subject: [PATCH 0085/1734] Disable Grappler optimizer for tests --- tensorflow/python/framework/test_util.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index bf00fa6439b..990fa429a17 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -974,6 +974,8 @@ class TensorFlowTestCase(googletest.TestCase): config.graph_options.optimizer_options.opt_level = -1 config.graph_options.rewrite_options.constant_folding = ( rewriter_config_pb2.RewriterConfig.OFF) + config.graph_options.rewrite_options.arithmetic_optimization = ( + rewriter_config_pb2.RewriterConfig.OFF) return config if graph is None: From 6ca5554b5a87cc5cb784d359ba03c5860ac8ead2 Mon Sep 17 00:00:00 2001 From: Anna R Date: Thu, 12 Apr 2018 00:24:52 -0700 Subject: [PATCH 0086/1734] Trying to fix Windows release build for libtensorflow --- tensorflow/c/c_api_experimental.cc | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index 4883e616423..073dc019c76 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -190,12 +190,6 @@ library { // be deleted by calling TF_DeleteFunction. 
static std::vector CreateImagenetDatasetFunctions( const char* file_path, std::string* dataset_name, TF_Status* status) { -#if defined(PLATFORM_WINDOWS) - status->status = tensorflow::errors::Unimplemented( - "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API " - "is not implemented for Windows"); - return std::vector(); -#else const char* func_def = R"PREFIX( library { function { @@ -7074,7 +7068,6 @@ library { DCHECK(found); }; return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status); -#endif } // On success, returns a set of TF_Function instances encoding a dataset @@ -7084,13 +7077,6 @@ library { static std::vector CreateMNISTDatasetFunctions( const char* file_path, int batch_size, std::string* dataset_name, TF_Status* status) { -#if defined(PLATFORM_WINDOWS) - // TODO(ashankar): cover CreateMNISTDatasetFunctions in Windows tests. - status->status = tensorflow::errors::Unimplemented( - "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API " - "is not implemented for Windows"); - return std::vector(); -#else const char* func_def = R"PREFIX( library { function { @@ -8220,7 +8206,6 @@ library { DCHECK(found_batch_size); }; return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status); -#endif } // Adds the input functions to `graph`. On success, returns the created @@ -8315,6 +8300,19 @@ TF_Operation* TF_MakeFakeIteratorGetNextWithDatasets(TF_Graph* graph, TF_Operation* TF_MakeFileBasedIteratorGetNextWithDatasets( TF_Graph* graph, const char* file_path, int batch_size, unsigned char is_mnist, TF_Status* status) { +#if defined(PLATFORM_WINDOWS) + // TODO(ashankar): get these functions working on Windows. + if (is_mnist) { + status->status = tensorflow::errors::Unimplemented( + "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API " + "is not implemented for Windows"); + } else { + status->status = tensorflow::errors::Unimplemented( + "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API " + "is not implemented for Windows"); + } + return nullptr +#else tensorflow::Status s; std::string dataset_name; @@ -8356,4 +8354,5 @@ TF_Operation* TF_MakeFileBasedIteratorGetNextWithDatasets( << graph->graph.ToGraphDefDebug().DebugString(); return getnext_node; +#endif } From 2e0cc141b7925d9c9e4c359ccf56e7485623c483 Mon Sep 17 00:00:00 2001 From: Anna R Date: Thu, 12 Apr 2018 00:31:20 -0700 Subject: [PATCH 0087/1734] Remove CreateImagenetDatasetFunctions and CreateMNISTDatasetFunctions on Windows --- tensorflow/c/c_api_experimental.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index 073dc019c76..a4af0b721e3 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -7070,6 +7070,7 @@ library { return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status); } +#if not defined(PLATFORM_WINDOWS) // On success, returns a set of TF_Function instances encoding a dataset // node stack that reads an MNIST file dataset from `file_path`, and // sets `dataset_name` to the created dataset name. The returned functions must @@ -8207,7 +8208,9 @@ library { }; return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status); } +#endif +#if not defined(PLATFORM_WINDOWS) // Adds the input functions to `graph`. On success, returns the created // IteratorGetNext node. 
static TF_Operation* AddDatasetFunctionAndIteratorNodesToGraph( @@ -8272,6 +8275,7 @@ static TF_Operation* AddDatasetFunctionAndIteratorNodesToGraph( VLOG(1) << "Output graph: " << graph->graph.ToGraphDefDebug().DebugString(); return ToTF_Operation(getnext_node); } +#endif TF_Operation* TF_MakeFakeIteratorGetNextWithDatasets(TF_Graph* graph, TF_Status* status) { From 9397987fe1fd8a632286fc1a2c2fe63bb8b4e26b Mon Sep 17 00:00:00 2001 From: Anna R Date: Thu, 12 Apr 2018 00:39:45 -0700 Subject: [PATCH 0088/1734] Fix removing incorrect function --- tensorflow/c/c_api_experimental.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index a4af0b721e3..97ec09e2258 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -184,6 +184,7 @@ library { return std::move(functions[0]); } +#if not defined(PLATFORM_WINDOWS) // On success, returns a set of TF_Function instances encoding a dataset // node stack that reads a Imagenet TFRecordFile dataset from `file_path`, and // sets `dataset_name` to the created dataset name. The returned functions must @@ -7069,6 +7070,7 @@ library { }; return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status); } +#endif #if not defined(PLATFORM_WINDOWS) // On success, returns a set of TF_Function instances encoding a dataset @@ -8210,7 +8212,6 @@ library { } #endif -#if not defined(PLATFORM_WINDOWS) // Adds the input functions to `graph`. On success, returns the created // IteratorGetNext node. static TF_Operation* AddDatasetFunctionAndIteratorNodesToGraph( @@ -8275,7 +8276,6 @@ static TF_Operation* AddDatasetFunctionAndIteratorNodesToGraph( VLOG(1) << "Output graph: " << graph->graph.ToGraphDefDebug().DebugString(); return ToTF_Operation(getnext_node); } -#endif TF_Operation* TF_MakeFakeIteratorGetNextWithDatasets(TF_Graph* graph, TF_Status* status) { From e52563a43a286042142c98fa1900ed0015d45c3f Mon Sep 17 00:00:00 2001 From: Anna R Date: Thu, 12 Apr 2018 08:48:19 -0700 Subject: [PATCH 0089/1734] Remove redundant if-statement --- tensorflow/c/c_api_experimental.cc | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index 97ec09e2258..0c3bb680e75 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -8306,15 +8306,9 @@ TF_Operation* TF_MakeFileBasedIteratorGetNextWithDatasets( unsigned char is_mnist, TF_Status* status) { #if defined(PLATFORM_WINDOWS) // TODO(ashankar): get these functions working on Windows. 
- if (is_mnist) { - status->status = tensorflow::errors::Unimplemented( - "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API " - "is not implemented for Windows"); - } else { - status->status = tensorflow::errors::Unimplemented( - "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API " - "is not implemented for Windows"); - } + status->status = tensorflow::errors::Unimplemented( + "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API " + "is not implemented for Windows"); return nullptr #else tensorflow::Status s; From ef2111b8ba3016c958d496dbe541c5f7157b26a9 Mon Sep 17 00:00:00 2001 From: Anna R Date: Thu, 12 Apr 2018 10:04:21 -0700 Subject: [PATCH 0090/1734] Install absl before building --- tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat index 97829892b10..3b437d3c58c 100644 --- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat +++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat @@ -31,6 +31,9 @@ IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Prog :: Set ctest binary location. IF DEFINED CTEST_EXE (ECHO CTEST_EXE is set to %CTEST_EXE%) ELSE (SET CTEST_EXE="C:\Program Files\cmake\bin\ctest.exe") +:: Install absl-py. +%PIP_EXE% install --upgrade absl-py + :: Run the CMAKE build to build the pip package. CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\gpu\cmake\run_build.bat if %errorlevel% neq 0 exit /b %errorlevel% @@ -40,9 +43,6 @@ DIR %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\ /S /B > wheel_filename_file set /p WHEEL_FILENAME= Date: Thu, 12 Apr 2018 10:13:06 -0700 Subject: [PATCH 0091/1734] Add missing semicolon --- tensorflow/c/c_api_experimental.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index 0c3bb680e75..581f5743eb7 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -8309,7 +8309,7 @@ TF_Operation* TF_MakeFileBasedIteratorGetNextWithDatasets( status->status = tensorflow::errors::Unimplemented( "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API " "is not implemented for Windows"); - return nullptr + return nullptr; #else tensorflow::Status s; From a6bc4afc97ce7a2a285e549822d06f4cbf51c4ef Mon Sep 17 00:00:00 2001 From: Sami Kama Date: Fri, 13 Apr 2018 10:19:24 -0700 Subject: [PATCH 0092/1734] Cherry-picking PR #18444 into r1.8 --- tensorflow/contrib/tensorrt/BUILD | 2 +- .../contrib/tensorrt/resources/trt_resource_manager.cc | 6 ++++++ .../contrib/tensorrt/resources/trt_resource_manager.h | 6 +----- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD index 2f316767b35..fd3582e175e 100644 --- a/tensorflow/contrib/tensorrt/BUILD +++ b/tensorflow/contrib/tensorrt/BUILD @@ -52,7 +52,6 @@ tf_custom_op_library( "ops/trt_engine_op.cc", ], deps = [ - ":trt_engine_op_kernel", ":trt_shape_function", "//tensorflow/core:lib_proto_parsing", ] + if_tensorrt([ @@ -183,6 +182,7 @@ tf_py_wrap_cc( copts = tf_copts(), deps = [ ":trt_conversion", + ":trt_engine_op_kernel", "//tensorflow/core:framework_lite", "//util/python:python_headers", ], diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc index 
e663eed4dd6..9c3698e5d1c 100644 --- a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc +++ b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc @@ -19,6 +19,12 @@ limitations under the License. namespace tensorflow { namespace tensorrt { +std::shared_ptr<TRTResourceManager> +tensorflow::tensorrt::TRTResourceManager::instance() { + static std::shared_ptr<TRTResourceManager> instance_(new TRTResourceManager); + return instance_; +} + std::shared_ptr<tensorflow::ResourceMgr> tensorflow::tensorrt::TRTResourceManager::getManager(const string& op_name) { // mutex is held for lookup only. Most instantiations where mutex will be held diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h index 5f8ad491d3c..bc15b51e05e 100644 --- a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h +++ b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h @@ -29,11 +29,7 @@ class TRTResourceManager { TRTResourceManager() = default; public: - static std::shared_ptr<TRTResourceManager> instance() { - static std::shared_ptr<TRTResourceManager> instance_( - new TRTResourceManager); - return instance_; - } + static std::shared_ptr<TRTResourceManager> instance(); // returns a manager for given op, if it doesn't exist it creates one std::shared_ptr<tensorflow::ResourceMgr> getManager(const string& op_name); From 8303fa2a53071a7e4a346454f707d25abbd6e1b5 Mon Sep 17 00:00:00 2001 From: James Wexler Date: Fri, 13 Apr 2018 13:33:37 -0400 Subject: [PATCH 0093/1734] closure proto library for example protos --- WORKSPACE | 19 ++++++++++++------- tensorflow/core/BUILD | 16 ++++++++++++++++ 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/WORKSPACE b/WORKSPACE index 11c5cdb2070..d37e2139225 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -1,13 +1,18 @@ workspace(name = "org_tensorflow") -http_archive( +## DO NOT SUBMIT +#http_archive( +# name = "io_bazel_rules_closure", +# sha256 = "6691c58a2cd30a86776dd9bb34898b041e37136f2dc7e24cadaeaf599c95c657", +# strip_prefix = "rules_closure-08039ba8ca59f64248bb3b6ae016460fe9c9914f", +# urls = [ +# "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz", +# "https://github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz", # 2018-01-16 +# ], +#) +local_repository( name = "io_bazel_rules_closure", - sha256 = "6691c58a2cd30a86776dd9bb34898b041e37136f2dc7e24cadaeaf599c95c657", - strip_prefix = "rules_closure-08039ba8ca59f64248bb3b6ae016460fe9c9914f", - urls = [ - "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz", - "https://github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz", # 2018-01-16 - ], + path = "/usr/local/google/home/jwexler/jameswex/rules_closure", ) load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories") diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index c5ca421ced2..08884fa9142 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -149,6 +149,7 @@ load( "//third_party/mkl:build_defs.bzl", "if_mkl", ) +load("@io_bazel_rules_closure//closure:defs.bzl","closure_proto_library") exports_files(["ops/ops.pbtxt"]) @@ -244,6 +245,21 @@ tf_nano_proto_library( deps = [":protos_all_cc"], ) +proto_library( + name = "example_protos", + srcs = [ + "example/example.proto", + "example/feature.proto", + ], + visibility = ["//visibility:public"], +) + +closure_proto_library( + name = "example_protos_closure", + deps = [":example_protos"], + visibility =
["//visibility:public"], +) + exports_files([ "framework/types.proto", ]) From 4fa6ca2bb74aa27ffb71a23e4a8d72810c377b07 Mon Sep 17 00:00:00 2001 From: James Wexler Date: Fri, 13 Apr 2018 14:09:42 -0400 Subject: [PATCH 0094/1734] review changes --- WORKSPACE | 19 +++++++------------ tensorflow/core/BUILD | 2 +- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/WORKSPACE b/WORKSPACE index d37e2139225..4ddfb9a3832 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -1,18 +1,13 @@ workspace(name = "org_tensorflow") -## DO NOT SUBMIT -#http_archive( -# name = "io_bazel_rules_closure", -# sha256 = "6691c58a2cd30a86776dd9bb34898b041e37136f2dc7e24cadaeaf599c95c657", -# strip_prefix = "rules_closure-08039ba8ca59f64248bb3b6ae016460fe9c9914f", -# urls = [ -# "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz", -# "https://github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz", # 2018-01-16 -# ], -#) -local_repository( +http_archive( name = "io_bazel_rules_closure", - path = "/usr/local/google/home/jwexler/jameswex/rules_closure", + sha256 = "a38539c5b5c358548e75b44141b4ab637bba7c4dc02b46b1f62a96d6433f56ae", + strip_prefix = "rules_closure-dbb96841cc0a5fb2664c37822803b06dab20c7d1", + urls = [ + "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz", + "https://github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz", # 2018-04-13 + ], ) load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories") diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 08884fa9142..ab25283cc44 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -149,7 +149,7 @@ load( "//third_party/mkl:build_defs.bzl", "if_mkl", ) -load("@io_bazel_rules_closure//closure:defs.bzl","closure_proto_library") +load("@io_bazel_rules_closure//closure:defs.bzl", "closure_proto_library") exports_files(["ops/ops.pbtxt"]) From 8e2fd4b30210ef633153b65d3d45cc51a3d4f0cf Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Fri, 13 Apr 2018 11:09:58 -0700 Subject: [PATCH 0095/1734] Use eager compatible wrappers in load_library for custom ops --- tensorflow/python/BUILD | 1 + tensorflow/python/framework/load_library.py | 2 +- tensorflow/python/framework/python_op_gen.i | 8 ++-- .../tools/ci_build/builds/test_user_ops.sh | 39 +++++++++++-------- 4 files changed, 29 insertions(+), 21 deletions(-) diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index db17a3fe023..9209ca4b96b 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -3286,6 +3286,7 @@ tf_py_wrap_cc( "//tensorflow/core/profiler/internal:print_model_analysis", "//tensorflow/tools/graph_transforms:transform_graph_lib", "//tensorflow/python/eager:pywrap_tfe_lib", + "//tensorflow/python/eager:python_eager_op_gen", "//util/python:python_headers", ] + (tf_additional_lib_deps() + tf_additional_plugin_deps() + diff --git a/tensorflow/python/framework/load_library.py b/tensorflow/python/framework/load_library.py index 1f2aa264c11..4f349304d34 100644 --- a/tensorflow/python/framework/load_library.py +++ b/tensorflow/python/framework/load_library.py @@ -60,7 +60,7 @@ def load_op_library(library_filename): op_list_str = py_tf.TF_GetOpList(lib_handle) op_list = op_def_pb2.OpList() op_list.ParseFromString(compat.as_bytes(op_list_str)) - wrappers = py_tf.GetPythonWrappers(op_list_str) + wrappers = py_tf.GetEagerPythonWrappers(op_list_str) 
# Delete the library handle to release any memory held in C # that are no longer needed. diff --git a/tensorflow/python/framework/python_op_gen.i b/tensorflow/python/framework/python_op_gen.i index 26ec4e8e66b..e39c425b050 100644 --- a/tensorflow/python/framework/python_op_gen.i +++ b/tensorflow/python/framework/python_op_gen.i @@ -16,10 +16,10 @@ limitations under the License. %include "tensorflow/python/platform/base.i" %{ -#include "tensorflow/python/framework/python_op_gen.h" +#include "tensorflow/python/eager/python_eager_op_gen.h" %} -// Input typemap for GetPythonWrappers. +// Input typemap for GetEagerPythonWrappers. // Accepts a python object of 'bytes' type, and converts it to // a const char* pointer and size_t length. The default typemap // going from python bytes to const char* tries to decode the @@ -37,5 +37,5 @@ limitations under the License. %ignoreall; -%unignore tensorflow::GetPythonWrappers; -%include "tensorflow/python/framework/python_op_gen.h" +%unignore tensorflow::GetEagerPythonWrappers; +%include "third_party/tensorflow/python/eager/python_eager_op_gen.h" diff --git a/tensorflow/tools/ci_build/builds/test_user_ops.sh b/tensorflow/tools/ci_build/builds/test_user_ops.sh index caa3a40817c..c342367bace 100755 --- a/tensorflow/tools/ci_build/builds/test_user_ops.sh +++ b/tensorflow/tools/ci_build/builds/test_user_ops.sh @@ -213,27 +213,34 @@ USER_OP=$(echo "${USER_OP_SO}" | sed -e 's/\.so//') echo "Invoking user op ${USER_OP} defined in file ${USER_OP_SO} "\ "via pip installation" -ORIG_OUTPUT=$("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; print(tf.Session('').run(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT})))") +function run_op() { + local ORIG_OUTPUT=$1 + local ADDITIONAL_LOG=$2 -# Format OUTPUT for analysis -if [[ -z $(echo "${ORIG_OUTPUT}" | grep -o ',') ]]; then - if [[ ${IS_MAC} == "1" ]]; then - OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -E -e 's/[ \t]+/,/g') + # Format OUTPUT for analysis + if [[ -z $(echo "${ORIG_OUTPUT}" | grep -o ',') ]]; then + if [[ ${IS_MAC} == "1" ]]; then + local OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -E -e 's/[ \t]+/,/g') + else + local OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -r -e 's/[ \t]+/,/g') + fi else - OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -r -e 's/[ \t]+/,/g') + local OUTPUT="${ORIG_OUTPUT}" fi -else - OUTPUT="${ORIG_OUTPUT}" -fi -EQUALS_EXPECTED=$("${PYTHON_BIN_PATH}" -c "print(${OUTPUT} == ${EXPECTED_OUTPUT})") + local EQUALS_EXPECTED=$("${PYTHON_BIN_PATH}" -c "print(${OUTPUT} == ${EXPECTED_OUTPUT})") -if [[ "${EQUALS_EXPECTED}" != "True" ]]; then - die "FAILED: Output from user op (${OUTPUT}) does not match expected "\ -"output ${EXPECTED_OUTPUT}" -else - echo "Output from user op (${OUTPUT}) matches expected output" -fi + if [[ "${EQUALS_EXPECTED}" != "True" ]]; then + local ERROR="FAILED: Output from user op (${OUTPUT}) does not match expected "\ + "output ${EXPECTED_OUTPUT}"${ADDITIONAL_LOG} + die ${ERROR} + else + echo "Output from user op (${OUTPUT}) matches expected output" + fi +} + +run_op $("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; print(tf.Session('').run(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT})))") +run_op $("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; tf.enable_eager_execution(); print(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT}))") " in eager mode" popd From 6942b87c255e9bce9289f87ff6894d198fcab6f4 Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Fri, 13 Apr 2018 11:09:58 -0700 Subject: [PATCH 0096/1734] Use eager compatible wrappers in load_library 
for custom ops --- tensorflow/python/BUILD | 1 + tensorflow/python/framework/load_library.py | 2 +- tensorflow/python/framework/python_op_gen.i | 8 ++-- .../tools/ci_build/builds/test_user_ops.sh | 39 +++++++++++-------- 4 files changed, 29 insertions(+), 21 deletions(-) diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index a683c8cfa66..579a8faaad6 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -3482,6 +3482,7 @@ tf_py_wrap_cc( "//tensorflow/core/profiler/internal:print_model_analysis", "//tensorflow/tools/graph_transforms:transform_graph_lib", "//tensorflow/python/eager:pywrap_tfe_lib", + "//tensorflow/python/eager:python_eager_op_gen", "//util/python:python_headers", ] + (tf_additional_lib_deps() + tf_additional_plugin_deps() + diff --git a/tensorflow/python/framework/load_library.py b/tensorflow/python/framework/load_library.py index 535c6017f5f..9a8477debb0 100644 --- a/tensorflow/python/framework/load_library.py +++ b/tensorflow/python/framework/load_library.py @@ -58,7 +58,7 @@ def load_op_library(library_filename): op_list_str = py_tf.TF_GetOpList(lib_handle) op_list = op_def_pb2.OpList() op_list.ParseFromString(compat.as_bytes(op_list_str)) - wrappers = py_tf.GetPythonWrappers(op_list_str) + wrappers = py_tf.GetEagerPythonWrappers(op_list_str) # Delete the library handle to release any memory held in C # that are no longer needed. diff --git a/tensorflow/python/framework/python_op_gen.i b/tensorflow/python/framework/python_op_gen.i index 26ec4e8e66b..e39c425b050 100644 --- a/tensorflow/python/framework/python_op_gen.i +++ b/tensorflow/python/framework/python_op_gen.i @@ -16,10 +16,10 @@ limitations under the License. %include "tensorflow/python/platform/base.i" %{ -#include "tensorflow/python/framework/python_op_gen.h" +#include "tensorflow/python/eager/python_eager_op_gen.h" %} -// Input typemap for GetPythonWrappers. +// Input typemap for GetEagerPythonWrappers. // Accepts a python object of 'bytes' type, and converts it to // a const char* pointer and size_t length. The default typemap // going from python bytes to const char* tries to decode the @@ -37,5 +37,5 @@ limitations under the License. 
%ignoreall; -%unignore tensorflow::GetPythonWrappers; -%include "tensorflow/python/framework/python_op_gen.h" +%unignore tensorflow::GetEagerPythonWrappers; +%include "third_party/tensorflow/python/eager/python_eager_op_gen.h" diff --git a/tensorflow/tools/ci_build/builds/test_user_ops.sh b/tensorflow/tools/ci_build/builds/test_user_ops.sh index caa3a40817c..c342367bace 100755 --- a/tensorflow/tools/ci_build/builds/test_user_ops.sh +++ b/tensorflow/tools/ci_build/builds/test_user_ops.sh @@ -213,27 +213,34 @@ USER_OP=$(echo "${USER_OP_SO}" | sed -e 's/\.so//') echo "Invoking user op ${USER_OP} defined in file ${USER_OP_SO} "\ "via pip installation" -ORIG_OUTPUT=$("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; print(tf.Session('').run(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT})))") +function run_op() { + local ORIG_OUTPUT=$1 + local ADDITIONAL_LOG=$2 -# Format OUTPUT for analysis -if [[ -z $(echo "${ORIG_OUTPUT}" | grep -o ',') ]]; then - if [[ ${IS_MAC} == "1" ]]; then - OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -E -e 's/[ \t]+/,/g') + # Format OUTPUT for analysis + if [[ -z $(echo "${ORIG_OUTPUT}" | grep -o ',') ]]; then + if [[ ${IS_MAC} == "1" ]]; then + local OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -E -e 's/[ \t]+/,/g') + else + local OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -r -e 's/[ \t]+/,/g') + fi else - OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -r -e 's/[ \t]+/,/g') + local OUTPUT="${ORIG_OUTPUT}" fi -else - OUTPUT="${ORIG_OUTPUT}" -fi -EQUALS_EXPECTED=$("${PYTHON_BIN_PATH}" -c "print(${OUTPUT} == ${EXPECTED_OUTPUT})") + local EQUALS_EXPECTED=$("${PYTHON_BIN_PATH}" -c "print(${OUTPUT} == ${EXPECTED_OUTPUT})") -if [[ "${EQUALS_EXPECTED}" != "True" ]]; then - die "FAILED: Output from user op (${OUTPUT}) does not match expected "\ -"output ${EXPECTED_OUTPUT}" -else - echo "Output from user op (${OUTPUT}) matches expected output" -fi + if [[ "${EQUALS_EXPECTED}" != "True" ]]; then + local ERROR="FAILED: Output from user op (${OUTPUT}) does not match expected "\ + "output ${EXPECTED_OUTPUT}"${ADDITIONAL_LOG} + die ${ERROR} + else + echo "Output from user op (${OUTPUT}) matches expected output" + fi +} + +run_op $("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; print(tf.Session('').run(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT})))") +run_op $("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; tf.enable_eager_execution(); print(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT}))") " in eager mode" popd From 988ad74476250eee70227349b5f1eabc86d22833 Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Fri, 13 Apr 2018 11:29:31 -0700 Subject: [PATCH 0097/1734] Not in third_party --- tensorflow/python/framework/python_op_gen.i | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/framework/python_op_gen.i b/tensorflow/python/framework/python_op_gen.i index e39c425b050..efcce2f2094 100644 --- a/tensorflow/python/framework/python_op_gen.i +++ b/tensorflow/python/framework/python_op_gen.i @@ -38,4 +38,4 @@ limitations under the License. 
%ignoreall; %unignore tensorflow::GetEagerPythonWrappers; -%include "third_party/tensorflow/python/eager/python_eager_op_gen.h" +%include "tensorflow/python/eager/python_eager_op_gen.h" From 7e0db0fe4992c466f758338183dfa0636c61a36b Mon Sep 17 00:00:00 2001 From: James Wexler Date: Fri, 13 Apr 2018 15:18:17 -0400 Subject: [PATCH 0098/1734] fix build file format --- tensorflow/core/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index ab25283cc44..46da23f6f96 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -256,8 +256,8 @@ proto_library( closure_proto_library( name = "example_protos_closure", - deps = [":example_protos"], visibility = ["//visibility:public"], + deps = [":example_protos"], ) exports_files([ From 76a73f899cdc5e19ef2b99373524dcb4dba0bd2b Mon Sep 17 00:00:00 2001 From: Younghee Kwon Date: Mon, 9 Apr 2018 17:45:13 -0700 Subject: [PATCH 0099/1734] boosted_trees: early stop hooks are fixed to stop at the right moment by reading tensor values in a separate session after train_op run. PiperOrigin-RevId: 192217338 --- .../python/estimator/boosted_trees_test.py | 97 +++++++------------ .../python/estimator/canned/boosted_trees.py | 33 +++---- .../estimator/canned/boosted_trees_test.py | 63 +++++------- 3 files changed, 71 insertions(+), 122 deletions(-) diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py index e99a87f3b3c..eee59106876 100644 --- a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py +++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py @@ -20,6 +20,7 @@ from __future__ import print_function import numpy as np from tensorflow.contrib.estimator.python.estimator import boosted_trees +from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2 from tensorflow.python.estimator.canned import boosted_trees as canned_boosted_trees from tensorflow.python.estimator.inputs import numpy_io from tensorflow.python.feature_column import feature_column @@ -69,10 +70,18 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase): for i in range(NUM_FEATURES) } - def _assert_checkpoint(self, model_dir, expected_global_step): - self.assertEqual(expected_global_step, - checkpoint_utils.load_variable(model_dir, - ops.GraphKeys.GLOBAL_STEP)) + def _assert_checkpoint(self, model_dir, global_step, finalized_trees, + attempted_layers): + reader = checkpoint_utils.load_checkpoint(model_dir) + self.assertEqual(global_step, reader.get_tensor(ops.GraphKeys.GLOBAL_STEP)) + serialized = reader.get_tensor('boosted_trees:0_serialized') + ensemble_proto = boosted_trees_pb2.TreeEnsemble() + ensemble_proto.ParseFromString(serialized) + self.assertEqual( + finalized_trees, + sum([1 for t in ensemble_proto.tree_metadata if t.is_finalized])) + self.assertEqual(attempted_layers, + ensemble_proto.growing_metadata.num_layers_attempted) def testTrainAndEvaluateEstimator(self): input_fn = _make_train_input_fn(is_classification=False) @@ -88,9 +97,10 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase): num_steps = 100 # Train for a few steps, and validate final checkpoint. 
est.train(input_fn, steps=num_steps) - self._assert_checkpoint(est.model_dir, 11) + self._assert_checkpoint( + est.model_dir, global_step=10, finalized_trees=2, attempted_layers=10) eval_res = est.evaluate(input_fn=input_fn, steps=1) - self.assertAllClose(eval_res['average_loss'], 0.913176) + self.assertAllClose(eval_res['average_loss'], 1.008551) def testInferEstimator(self): train_input_fn = _make_train_input_fn(is_classification=False) @@ -108,31 +118,13 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase): num_steps = 100 # Train for a few steps, and validate final checkpoint. est.train(train_input_fn, steps=num_steps) - self._assert_checkpoint(est.model_dir, 6) - + self._assert_checkpoint( + est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5) + # Validate predictions. predictions = list(est.predict(input_fn=predict_input_fn)) - self.assertEquals(5, len(predictions)) - self.assertAllClose([0.703549], predictions[0]['predictions']) - self.assertAllClose([0.266539], predictions[1]['predictions']) - self.assertAllClose([0.256479], predictions[2]['predictions']) - self.assertAllClose([1.088732], predictions[3]['predictions']) - self.assertAllClose([1.901732], predictions[4]['predictions']) - - -class BoostedTreesClassifierTrainInMemoryTest(test_util.TensorFlowTestCase): - - def setUp(self): - self._feature_columns = { - feature_column.bucketized_column( - feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32), - BUCKET_BOUNDARIES) - for i in range(NUM_FEATURES) - } - - def _assert_checkpoint(self, model_dir, expected_global_step): - self.assertEqual(expected_global_step, - checkpoint_utils.load_variable(model_dir, - ops.GraphKeys.GLOBAL_STEP)) + self.assertAllClose( + [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]], + [pred['predictions'] for pred in predictions]) def testBinaryClassifierTrainInMemoryAndEvalAndInfer(self): train_input_fn = _make_train_input_fn(is_classification=True) @@ -145,36 +137,16 @@ class BoostedTreesClassifierTrainInMemoryTest(test_util.TensorFlowTestCase): n_trees=1, max_depth=5) # It will stop after 5 steps because of the max depth and num trees. - self._assert_checkpoint(est.model_dir, 6) + self._assert_checkpoint( + est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5) # Check eval. eval_res = est.evaluate(input_fn=train_input_fn, steps=1) self.assertAllClose(eval_res['accuracy'], 1.0) - - # Check predict that all labels are correct. + # Validate predictions. 
predictions = list(est.predict(input_fn=predict_input_fn)) - self.assertEquals(5, len(predictions)) - self.assertAllClose([0], predictions[0]['class_ids']) - self.assertAllClose([1], predictions[1]['class_ids']) - self.assertAllClose([1], predictions[2]['class_ids']) - self.assertAllClose([0], predictions[3]['class_ids']) - self.assertAllClose([0], predictions[4]['class_ids']) - - -class BoostedTreesRegressorTrainInMemoryTest(test_util.TensorFlowTestCase): - - def setUp(self): - self._feature_columns = { - feature_column.bucketized_column( - feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32), - BUCKET_BOUNDARIES) - for i in range(NUM_FEATURES) - } - - def _assert_checkpoint(self, model_dir, expected_global_step): - self.assertEqual(expected_global_step, - checkpoint_utils.load_variable(model_dir, - ops.GraphKeys.GLOBAL_STEP)) + self.assertAllClose([[0], [1], [1], [0], [0]], + [pred['class_ids'] for pred in predictions]) def testRegressorTrainInMemoryAndEvalAndInfer(self): train_input_fn = _make_train_input_fn(is_classification=False) @@ -187,20 +159,17 @@ class BoostedTreesRegressorTrainInMemoryTest(test_util.TensorFlowTestCase): n_trees=1, max_depth=5) # It will stop after 5 steps because of the max depth and num trees. - self._assert_checkpoint(est.model_dir, 6) + self._assert_checkpoint( + est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5) # Check eval. eval_res = est.evaluate(input_fn=train_input_fn, steps=1) - self.assertAllClose(eval_res['average_loss'], 2.2136638) - + self.assertAllClose(eval_res['average_loss'], 2.478283) # Validate predictions. predictions = list(est.predict(input_fn=predict_input_fn)) - self.assertEquals(5, len(predictions)) - self.assertAllClose([0.703549], predictions[0]['predictions']) - self.assertAllClose([0.266539], predictions[1]['predictions']) - self.assertAllClose([0.256479], predictions[2]['predictions']) - self.assertAllClose([1.088732], predictions[3]['predictions']) - self.assertAllClose([1.901732], predictions[4]['predictions']) + self.assertAllClose( + [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]], + [pred['predictions'] for pred in predictions]) if __name__ == '__main__': diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py index 500ea03ea7f..c5d5455b1a3 100644 --- a/tensorflow/python/estimator/canned/boosted_trees.py +++ b/tensorflow/python/estimator/canned/boosted_trees.py @@ -209,8 +209,8 @@ class _CacheTrainingStatesUsingVariables(object): name='cache_insert') -class StopAtAttemptsHook(session_run_hook.SessionRunHook): - """Hook that requests stop at the number of trees.""" +class _StopAtAttemptsHook(session_run_hook.SessionRunHook): + """Hook that requests stop at the number of attempts.""" def __init__(self, num_finalized_trees_tensor, num_attempted_layers_tensor, max_trees, max_depth): @@ -224,25 +224,17 @@ class StopAtAttemptsHook(session_run_hook.SessionRunHook): [self._num_finalized_trees_tensor, self._num_attempted_layers_tensor]) def after_run(self, run_context, run_values): + # num_* tensors should be retrieved by a separate session than the training + # one, in order to read the values after growing. + # So, if it's approaching to the limit, get the actual value by additional + # session. 
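  # As a minimal self-contained sketch of this pattern (a hypothetical hook,
  # not part of this patch), assume a counter tensor that the train op
  # increments by one and a fixed limit. The result fetched via before_run may
  # predate this step's update, so when the stale value is close to the limit
  # a fresh run_context.session.run() supplies the authoritative value:
  #
  #   class _StopAtCounterHook(session_run_hook.SessionRunHook):
  #
  #     def __init__(self, counter_tensor, limit):
  #       self._counter_tensor = counter_tensor
  #       self._limit = limit
  #
  #     def before_run(self, run_context):
  #       return session_run_hook.SessionRunArgs(self._counter_tensor)
  #
  #     def after_run(self, run_context, run_values):
  #       if run_values.results >= self._limit - 1:  # possibly stale; recheck
  #         if run_context.session.run(self._counter_tensor) >= self._limit:
  #           run_context.request_stop()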
num_finalized_trees, num_attempted_layers = run_values.results + if (num_finalized_trees >= self._max_trees - 1 or + num_attempted_layers > 2 * self._max_trees * self._max_depth - 1): + num_finalized_trees, num_attempted_layers = run_context.session.run( + [self._num_finalized_trees_tensor, self._num_attempted_layers_tensor]) if (num_finalized_trees >= self._max_trees or - 1.0 * num_attempted_layers / self._max_depth > 2 * self._max_trees): - run_context.request_stop() - - -class StopAtNumTreesHook(session_run_hook.SessionRunHook): - """Hook that requests stop at the number of trees.""" - - def __init__(self, num_trees_tensor, max_trees): - self._num_trees_tensor = num_trees_tensor - self._max_trees = max_trees - - def before_run(self, run_context): - return session_run_hook.SessionRunArgs(self._num_trees_tensor) - - def after_run(self, run_context, run_values): - num_trees = run_values.results - if num_trees > self._max_trees: + num_attempted_layers > 2 * self._max_trees * self._max_depth): run_context.request_stop() @@ -468,7 +460,8 @@ def _bt_model_fn( # Add an early stop hook. estimator_spec = estimator_spec._replace( training_hooks=estimator_spec.training_hooks + - (StopAtNumTreesHook(num_trees, tree_hparams.n_trees),)) + (_StopAtAttemptsHook(num_finalized_trees, num_attempted_layers, + tree_hparams.n_trees, tree_hparams.max_depth),)) return estimator_spec diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py index 01e5cc7a5d6..625745a3f97 100644 --- a/tensorflow/python/estimator/canned/boosted_trees_test.py +++ b/tensorflow/python/estimator/canned/boosted_trees_test.py @@ -69,7 +69,7 @@ def _make_train_input_fn(is_classification): return _input_fn -class BoostedTreesClassifierTest(test_util.TensorFlowTestCase): +class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase): def setUp(self): self._feature_columns = { @@ -79,10 +79,18 @@ class BoostedTreesClassifierTest(test_util.TensorFlowTestCase): for i in range(NUM_FEATURES) } - def _assert_checkpoint(self, model_dir, expected_global_step): - self.assertEqual(expected_global_step, - checkpoint_utils.load_variable(model_dir, - ops.GraphKeys.GLOBAL_STEP)) + def _assert_checkpoint(self, model_dir, global_step, finalized_trees, + attempted_layers): + reader = checkpoint_utils.load_checkpoint(model_dir) + self.assertEqual(global_step, reader.get_tensor(ops.GraphKeys.GLOBAL_STEP)) + serialized = reader.get_tensor('boosted_trees:0_serialized') + ensemble_proto = boosted_trees_pb2.TreeEnsemble() + ensemble_proto.ParseFromString(serialized) + self.assertEqual( + finalized_trees, + sum([1 for t in ensemble_proto.tree_metadata if t.is_finalized])) + self.assertEqual(attempted_layers, + ensemble_proto.growing_metadata.num_layers_attempted) def testTrainAndEvaluateBinaryClassifier(self): input_fn = _make_train_input_fn(is_classification=True) @@ -97,7 +105,8 @@ class BoostedTreesClassifierTest(test_util.TensorFlowTestCase): num_steps = 100 # Train for a few steps, and validate final checkpoint. 
est.train(input_fn, steps=num_steps) - self._assert_checkpoint(est.model_dir, 6) + self._assert_checkpoint( + est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5) eval_res = est.evaluate(input_fn=input_fn, steps=1) self.assertAllClose(eval_res['accuracy'], 1.0) @@ -118,29 +127,9 @@ class BoostedTreesClassifierTest(test_util.TensorFlowTestCase): est.train(train_input_fn, steps=num_steps) predictions = list(est.predict(input_fn=predict_input_fn)) - self.assertEquals(5, len(predictions)) # All labels are correct. - self.assertAllClose([0], predictions[0]['class_ids']) - self.assertAllClose([1], predictions[1]['class_ids']) - self.assertAllClose([1], predictions[2]['class_ids']) - self.assertAllClose([0], predictions[3]['class_ids']) - self.assertAllClose([0], predictions[4]['class_ids']) - - -class BoostedTreesRegressionTest(test_util.TensorFlowTestCase): - - def setUp(self): - self._feature_columns = { - feature_column.bucketized_column( - feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32), - BUCKET_BOUNDARIES) - for i in range(NUM_FEATURES) - } - - def _assert_checkpoint(self, model_dir, expected_global_step): - self.assertEqual(expected_global_step, - checkpoint_utils.load_variable(model_dir, - ops.GraphKeys.GLOBAL_STEP)) + self.assertAllClose([[0], [1], [1], [0], [0]], + [pred['class_ids'] for pred in predictions]) def testTrainAndEvaluateRegressor(self): input_fn = _make_train_input_fn(is_classification=False) @@ -155,9 +144,10 @@ class BoostedTreesRegressionTest(test_util.TensorFlowTestCase): num_steps = 100 # Train for a few steps, and validate final checkpoint. est.train(input_fn, steps=num_steps) - self._assert_checkpoint(est.model_dir, 11) + self._assert_checkpoint( + est.model_dir, global_step=10, finalized_trees=2, attempted_layers=10) eval_res = est.evaluate(input_fn=input_fn, steps=1) - self.assertAllClose(eval_res['average_loss'], 0.913176) + self.assertAllClose(eval_res['average_loss'], 1.008551) def testInferRegressor(self): train_input_fn = _make_train_input_fn(is_classification=False) @@ -174,16 +164,13 @@ class BoostedTreesRegressionTest(test_util.TensorFlowTestCase): num_steps = 100 # Train for a few steps, and validate final checkpoint. est.train(train_input_fn, steps=num_steps) - self._assert_checkpoint(est.model_dir, 6) + self._assert_checkpoint( + est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5) predictions = list(est.predict(input_fn=predict_input_fn)) - - self.assertEquals(5, len(predictions)) - self.assertAllClose([0.703549], predictions[0]['predictions']) - self.assertAllClose([0.266539], predictions[1]['predictions']) - self.assertAllClose([0.256479], predictions[2]['predictions']) - self.assertAllClose([1.088732], predictions[3]['predictions']) - self.assertAllClose([1.901732], predictions[4]['predictions']) + self.assertAllClose( + [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]], + [pred['predictions'] for pred in predictions]) class ModelFnTests(test_util.TensorFlowTestCase): From 3e1739c0c3c6cd3b74879f3e1872dd1354401e56 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 11 Apr 2018 15:37:49 -0700 Subject: [PATCH 0100/1734] Revealing the range of node ids in the latest layer via resource' state PiperOrigin-RevId: 192520351 --- ...tedTreesCalculateBestGainsPerFeature.pbtxt | 4 +- ...pi_def_BoostedTreesGetEnsembleStates.pbtxt | 12 +++++- .../kernels/boosted_trees/boosted_trees.proto | 4 ++ .../kernels/boosted_trees/resource_ops.cc | 12 ++++++ .../core/kernels/boosted_trees/resources.h | 20 ++++++++++ .../core/kernels/boosted_trees/stats_ops.cc | 6 +-- .../kernels/boosted_trees/training_ops.cc | 8 ++++ tensorflow/core/ops/boosted_trees_ops.cc | 2 + .../core/ops/compat/ops_history.v1.pbtxt | 4 ++ .../python/estimator/canned/boosted_trees.py | 9 ++--- .../estimator/canned/boosted_trees_test.py | 12 ++++++ .../boosted_trees/resource_ops_test.py | 31 +++++++++----- .../boosted_trees/stats_ops_test.py | 8 ++-- .../boosted_trees/training_ops_test.py | 40 +++++++++++++++++-- tensorflow/python/ops/boosted_trees_ops.py | 15 ++++--- 15 files changed, 150 insertions(+), 37 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt index b1921e3507b..62876a293c1 100644 --- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt @@ -4,7 +4,7 @@ op { in_arg { name: "node_id_range" description: <allocate_output(0, TensorShape(), &output_stamp_token_t)); @@ -110,11 +111,22 @@ class BoostedTreesGetEnsembleStatesOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(3, TensorShape(), &output_num_attempted_layers_t)); + OP_REQUIRES_OK(context, context->allocate_output( + 4, {2}, &output_last_layer_nodes_range_t)); output_stamp_token_t->scalar()() = tree_ensemble_resource->stamp(); output_num_trees_t->scalar()() = num_trees; output_num_finalized_trees_t->scalar()() = num_finalized_trees; output_num_attempted_layers_t->scalar()() = num_attempted_layers; + + int32 range_start; + int32 range_end; + tree_ensemble_resource->GetLastLayerNodesRange(&range_start, &range_end); + + output_last_layer_nodes_range_t->vec()(0) = range_start; + // For a completely empty ensemble, this will be 0. To make it a valid range + // we add this max cond. 
+ output_last_layer_nodes_range_t->vec<int32>()(1) = std::max(1, range_end); } }; diff --git a/tensorflow/core/kernels/boosted_trees/resources.h b/tensorflow/core/kernels/boosted_trees/resources.h index c82588b9507..561ca3a18a7 100644 --- a/tensorflow/core/kernels/boosted_trees/resources.h +++ b/tensorflow/core/kernels/boosted_trees/resources.h @@ -93,6 +93,26 @@ class BoostedTreesEnsembleResource : public StampedResource { new_num_layers); } + void UpdateLastLayerNodesRange(const int32 node_range_start, + int32 node_range_end) const { + tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_start( + node_range_start); + tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_end( + node_range_end); + } + + void GetLastLayerNodesRange(int32* node_range_start, + int32* node_range_end) const { + *node_range_start = + tree_ensemble_->growing_metadata().last_layer_node_start(); + *node_range_end = tree_ensemble_->growing_metadata().last_layer_node_end(); + } + + int64 GetNumNodes(const int32 tree_id) { + DCHECK_LT(tree_id, tree_ensemble_->trees_size()); + return tree_ensemble_->trees(tree_id).nodes_size(); + } + void UpdateGrowingMetadata() const; int32 GetNumLayersAttempted() { diff --git a/tensorflow/core/kernels/boosted_trees/stats_ops.cc b/tensorflow/core/kernels/boosted_trees/stats_ops.cc index 33fdab6a860..16e65cf2843 100644 --- a/tensorflow/core/kernels/boosted_trees/stats_ops.cc +++ b/tensorflow/core/kernels/boosted_trees/stats_ops.cc @@ -42,8 +42,8 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel { const Tensor* node_id_range_t; OP_REQUIRES_OK(context, context->input("node_id_range", &node_id_range_t)); const auto node_id_range = node_id_range_t->vec<int32>(); - int32 node_id_first = node_id_range(0); - int32 node_id_last = node_id_range(1); // inclusive. + const int32 node_id_first = node_id_range(0); // inclusive + const int32 node_id_last = node_id_range(1); // exclusive // stats_summary_list OpInputList stats_summary_list; OP_REQUIRES_OK(context, context->input_list("stats_summary_list", @@ -86,7 +86,7 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel { std::vector<int32> output_thresholds; std::vector<float> output_left_node_contribs; std::vector<float> output_right_node_contribs; - for (int node_id = node_id_first; node_id <= node_id_last; ++node_id) { + for (int node_id = node_id_first; node_id < node_id_last; ++node_id) { // Calculate gains. cum_grad.clear(); cum_hess.clear(); diff --git a/tensorflow/core/kernels/boosted_trees/training_ops.cc b/tensorflow/core/kernels/boosted_trees/training_ops.cc index b9ded4054ac..67cac14c520 100644 --- a/tensorflow/core/kernels/boosted_trees/training_ops.cc +++ b/tensorflow/core/kernels/boosted_trees/training_ops.cc @@ -101,6 +101,7 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel { << current_tree << " of ensemble of " << current_tree + 1 << " trees."; bool split_happened = false; + int32 node_id_start = ensemble_resource->GetNumNodes(current_tree); // Add the splits to the tree. for (auto& split_entry : best_splits) { const int32 node_id = split_entry.first; @@ -139,11 +140,15 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel { right_contrib, &left_node_id, &right_node_id); split_happened = true; } + int32 node_id_end = ensemble_resource->GetNumNodes(current_tree); if (split_happened) { // Update growable tree metadata. ensemble_resource->SetNumLayersGrown(current_tree, new_num_layers); // Finalize the tree if needed.
if (ensemble_resource->GetNumLayersGrown(current_tree) >= max_depth_) { + // If the tree is finalized, next growing will start from node 0; + node_id_start = 0; + node_id_end = 1; ensemble_resource->SetIsFinalized(current_tree, true); if (pruning_mode_ == kPostPruning) { ensemble_resource->PostPruneTree(current_tree); @@ -153,6 +158,9 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel { ensemble_resource->AddNewTree(kLayerByLayerTreeWeight); } } + // If we managed to split, update the node range. If we didn't, don't + // update as we will try to split the same nodes with new instances. + ensemble_resource->UpdateLastLayerNodesRange(node_id_start, node_id_end); } } diff --git a/tensorflow/core/ops/boosted_trees_ops.cc b/tensorflow/core/ops/boosted_trees_ops.cc index 297e94655fe..8af49034189 100644 --- a/tensorflow/core/ops/boosted_trees_ops.cc +++ b/tensorflow/core/ops/boosted_trees_ops.cc @@ -128,6 +128,7 @@ REGISTER_OP("BoostedTreesGetEnsembleStates") .Output("num_trees: int32") .Output("num_finalized_trees: int32") .Output("num_attempted_layers: int32") + .Output("last_layer_nodes_range: int32") .SetShapeFn([](shape_inference::InferenceContext* c) { shape_inference::ShapeHandle unused_input; TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input)); @@ -135,6 +136,7 @@ REGISTER_OP("BoostedTreesGetEnsembleStates") c->set_output(1, c->Scalar()); c->set_output(2, c->Scalar()); c->set_output(3, c->Scalar()); + c->set_output(4, c->Vector(2)); return Status::OK(); }); diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt index 026bfa89cfb..2f6f588d2c3 100644 --- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt @@ -10861,6 +10861,10 @@ op { name: "num_attempted_layers" type: DT_INT32 } + output_arg { + name: "last_layer_nodes_range" + type: DT_INT32 + } is_stateful: true } op { diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py index c5d5455b1a3..58af59dbb17 100644 --- a/tensorflow/python/estimator/canned/boosted_trees.py +++ b/tensorflow/python/estimator/canned/boosted_trees.py @@ -349,8 +349,8 @@ def _bt_model_fn( array_ops.zeros( [batch_size, head.logits_dimension], dtype=dtypes.float32)) with ops.control_dependencies([ensemble_reload]): - (stamp_token, num_trees, num_finalized_trees, - num_attempted_layers) = local_tree_ensemble.get_states() + (stamp_token, num_trees, num_finalized_trees, num_attempted_layers, + last_layer_nodes_range) = local_tree_ensemble.get_states() summary.scalar('ensemble/num_trees', num_trees) summary.scalar('ensemble/num_finalized_trees', num_finalized_trees) summary.scalar('ensemble/num_attempted_layers', num_attempted_layers) @@ -393,10 +393,7 @@ def _bt_model_fn( (node_ids_per_feature, gains_list, thresholds_list, left_node_contribs_list, right_node_contribs_list) = ( boosted_trees_ops.calculate_best_gains_per_feature( - node_id_range=array_ops.stack([ - math_ops.reduce_min(node_ids), - math_ops.reduce_max(node_ids) - ]), + node_id_range=last_layer_nodes_range, stats_summary_list=stats_summary_list, l1=tree_hparams.l1, l2=tree_hparams.l2, diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py index 625745a3f97..7823ef84100 100644 --- a/tensorflow/python/estimator/canned/boosted_trees_test.py +++ b/tensorflow/python/estimator/canned/boosted_trees_test.py @@ -223,6 +223,8 @@ class 
ModelFnTests(test_util.TensorFlowTestCase): growing_metadata { num_trees_attempted: 1 num_layers_attempted: 1 + last_layer_node_start: 1 + last_layer_node_end: 3 } """ second_round = """ @@ -307,6 +309,8 @@ class ModelFnTests(test_util.TensorFlowTestCase): growing_metadata { num_trees_attempted: 1 num_layers_attempted: 2 + last_layer_node_start: 0 + last_layer_node_end: 1 } """ third_round = """ @@ -407,6 +411,8 @@ class ModelFnTests(test_util.TensorFlowTestCase): growing_metadata { num_trees_attempted: 2 num_layers_attempted: 3 + last_layer_node_start: 1 + last_layer_node_end: 3 } """ return (first_round, second_round, third_round) @@ -444,6 +450,8 @@ class ModelFnTests(test_util.TensorFlowTestCase): growing_metadata { num_trees_attempted: 1 num_layers_attempted: 1 + last_layer_node_start: 1 + last_layer_node_end: 3 } """ second_round = """ @@ -528,6 +536,8 @@ class ModelFnTests(test_util.TensorFlowTestCase): growing_metadata { num_trees_attempted: 1 num_layers_attempted: 2 + last_layer_node_start: 0 + last_layer_node_end: 1 } """ third_round = """ @@ -628,6 +638,8 @@ class ModelFnTests(test_util.TensorFlowTestCase): growing_metadata { num_trees_attempted: 2 num_layers_attempted: 3 + last_layer_node_start: 1 + last_layer_node_end: 3 } """ return (first_round, second_round, third_round) diff --git a/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py index a223241e893..d5f0c22d6e0 100644 --- a/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py +++ b/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py @@ -36,16 +36,18 @@ class ResourceOpsTest(test_util.TensorFlowTestCase): resources.initialize_resources(resources.shared_resources()).run() stamp_token = ensemble.get_stamp_token() self.assertEqual(0, stamp_token.eval()) - (_, num_trees, num_finalized_trees, - num_attempted_layers) = ensemble.get_states() + (_, num_trees, num_finalized_trees, num_attempted_layers, + nodes_range) = ensemble.get_states() self.assertEqual(0, num_trees.eval()) self.assertEqual(0, num_finalized_trees.eval()) self.assertEqual(0, num_attempted_layers.eval()) + self.assertAllEqual([0, 1], nodes_range.eval()) def testCreateWithProto(self): with self.test_session(): ensemble_proto = boosted_trees_pb2.TreeEnsemble() - text_format.Merge(""" + text_format.Merge( + """ trees { nodes { bucketized_split { @@ -141,6 +143,8 @@ class ResourceOpsTest(test_util.TensorFlowTestCase): growing_metadata { num_trees_attempted: 2 num_layers_attempted: 6 + last_layer_node_start: 16 + last_layer_node_end: 19 } """, ensemble_proto) ensemble = boosted_trees_ops.TreeEnsemble( @@ -148,28 +152,31 @@ class ResourceOpsTest(test_util.TensorFlowTestCase): stamp_token=7, serialized_proto=ensemble_proto.SerializeToString()) resources.initialize_resources(resources.shared_resources()).run() - (stamp_token, num_trees, num_finalized_trees, - num_attempted_layers) = ensemble.get_states() + (stamp_token, num_trees, num_finalized_trees, num_attempted_layers, + nodes_range) = ensemble.get_states() self.assertEqual(7, stamp_token.eval()) self.assertEqual(2, num_trees.eval()) self.assertEqual(1, num_finalized_trees.eval()) self.assertEqual(6, num_attempted_layers.eval()) + self.assertAllEqual([16, 19], nodes_range.eval()) def testSerializeDeserialize(self): with self.test_session(): # Initialize. 
ensemble = boosted_trees_ops.TreeEnsemble('ensemble', stamp_token=5) resources.initialize_resources(resources.shared_resources()).run() - (stamp_token, num_trees, num_finalized_trees, - num_attempted_layers) = ensemble.get_states() + (stamp_token, num_trees, num_finalized_trees, num_attempted_layers, + nodes_range) = ensemble.get_states() self.assertEqual(5, stamp_token.eval()) self.assertEqual(0, num_trees.eval()) self.assertEqual(0, num_finalized_trees.eval()) self.assertEqual(0, num_attempted_layers.eval()) + self.assertAllEqual([0, 1], nodes_range.eval()) # Deserialize. ensemble_proto = boosted_trees_pb2.TreeEnsemble() - text_format.Merge(""" + text_format.Merge( + """ trees { nodes { bucketized_split { @@ -201,6 +208,8 @@ class ResourceOpsTest(test_util.TensorFlowTestCase): growing_metadata { num_trees_attempted: 1 num_layers_attempted: 5 + last_layer_node_start: 3 + last_layer_node_end: 7 } """, ensemble_proto) with ops.control_dependencies([ @@ -208,13 +217,15 @@ class ResourceOpsTest(test_util.TensorFlowTestCase): stamp_token=3, serialized_proto=ensemble_proto.SerializeToString()) ]): - (stamp_token, num_trees, num_finalized_trees, - num_attempted_layers) = ensemble.get_states() + (stamp_token, num_trees, num_finalized_trees, num_attempted_layers, + nodes_range) = ensemble.get_states() self.assertEqual(3, stamp_token.eval()) self.assertEqual(1, num_trees.eval()) # This reads from metadata, not really counting the layers. self.assertEqual(5, num_attempted_layers.eval()) self.assertEqual(0, num_finalized_trees.eval()) + self.assertAllEqual([3, 7], nodes_range.eval()) + # Serialize. new_ensemble_proto = boosted_trees_pb2.TreeEnsemble() diff --git a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py index a54cc43517f..4d09cf94d42 100644 --- a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py +++ b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py @@ -29,7 +29,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase): """Testing Gain calculation without any regularization.""" with self.test_session() as sess: max_splits = 7 - node_id_range = [1, 2] # node 1 through 2 will be processed. + node_id_range = [1, 3] # node 1 through 2 will be processed. stats_summary_list = [ [ [[0., 0.], [.08, .09], [0., 0.], [0., 0.]], # node 0; ignored @@ -76,7 +76,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase): """Testing Gain calculation with L2.""" with self.test_session() as sess: max_splits = 7 - node_id_range = [1, 2] # node 1 through 2 will be processed. + node_id_range = [1, 3] # node 1 through 2 will be processed. stats_summary_list = [ [ [[0., 0.], [.08, .09], [0., 0.], [0., 0.]], # node 0; ignored @@ -123,7 +123,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase): """Testing Gain calculation with L1.""" with self.test_session() as sess: max_splits = 7 - node_id_range = [1, 2] # node 1 through 2 will be processed. + node_id_range = [1, 3] # node 1 through 2 will be processed. stats_summary_list = [ [ [[0., 0.], [.08, .09], [0., 0.], [0., 0.]], # node 0; ignored @@ -173,7 +173,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase): """Testing Gain calculation with L2.""" with self.test_session() as sess: max_splits = 7 - node_id_range = [1, 2] # node 1 through 2 will be processed. + node_id_range = [1, 3] # node 1 through 2 will be processed. 
stats_summary_list = [ [ [[0., 0.], [.08, .09], [0., 0.], [0., 0.]], # node 0; ignored diff --git a/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py index 4226ff75c23..d6c00477474 100644 --- a/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py +++ b/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py @@ -132,6 +132,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase): growing_metadata { num_trees_attempted: 1 num_layers_attempted: 1 + last_layer_node_start: 0 + last_layer_node_end: 1 } """ self.assertEqual(new_stamp, 1) @@ -314,6 +316,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase): growing_metadata { num_trees_attempted: 1 num_layers_attempted: 2 + last_layer_node_start: 0 + last_layer_node_end: 1 } """ self.assertEqual(new_stamp, 1) @@ -461,6 +465,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase): growing_metadata { num_trees_attempted: 2 num_layers_attempted: 2 + last_layer_node_start: 1 + last_layer_node_end: 3 } """ self.assertEqual(new_stamp, 1) @@ -615,6 +621,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase): growing_metadata { num_trees_attempted: 1 num_layers_attempted: 2 + last_layer_node_start: 3 + last_layer_node_end: 5 } """ self.assertEqual(new_stamp, 1) @@ -624,7 +632,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase): """Test that the metadata is updated even though we can't split.""" with self.test_session() as session: tree_ensemble_config = boosted_trees_pb2.TreeEnsemble() - text_format.Merge(""" + text_format.Merge( + """ trees { nodes { bucketized_split { @@ -655,6 +664,9 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase): growing_metadata { num_trees_attempted: 1 num_layers_attempted: 1 + last_layer_node_start: 1 + last_layer_node_end: 3 + } """, tree_ensemble_config) @@ -685,7 +697,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase): # Expect no new splits created, but attempted (global) stats updated. Meta # data for this tree should not be updated (we didn't succeed building a - # layer. + # layer. Node ranges don't change. 
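
The expected protos above pin down the semantics of the new growing_metadata fields: last_layer_node_start and last_layer_node_end record, as a half-open range, the ids of the nodes added by the most recent layer, and a fresh tree reports [0, 1) (just the root). A hedged sketch of that bookkeeping, inferred purely from the test expectations (the function name is illustrative, not from the kernel source):

    # Inferred from the expected protos, not from the kernel source: a layer
    # that adds `num_new_nodes` nodes starting at `next_free_node_id` yields
    # the half-open range stored in last_layer_node_start/_end.
    def layer_node_range(next_free_node_id, num_new_nodes):
      start = next_free_node_id
      return start, start + num_new_nodes

    assert layer_node_range(1, 2) == (1, 3)  # root split -> nodes 1 and 2
    assert layer_node_range(3, 4) == (3, 7)  # next layer -> nodes 3..6
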
new_stamp, serialized = session.run(tree_ensemble.serialize()) tree_ensemble = boosted_trees_pb2.TreeEnsemble() tree_ensemble.ParseFromString(serialized) @@ -721,6 +733,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase): growing_metadata { num_trees_attempted: 1 num_layers_attempted: 2 + last_layer_node_start: 1 + last_layer_node_end: 3 } """ self.assertEqual(new_stamp, 1) @@ -730,7 +744,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase): """Test metadata is updated correctly when no split due to prepruning.""" with self.test_session() as session: tree_ensemble_config = boosted_trees_pb2.TreeEnsemble() - text_format.Merge(""" + text_format.Merge( + """ trees { nodes { bucketized_split { @@ -761,6 +776,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase): growing_metadata { num_trees_attempted: 1 num_layers_attempted: 1 + last_layer_node_start: 1 + last_layer_node_end: 3 } """, tree_ensemble_config) @@ -851,6 +868,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase): growing_metadata { num_trees_attempted: 1 num_layers_attempted: 2 + last_layer_node_start: 1 + last_layer_node_end: 3 } """ self.assertEqual(new_stamp, 1) @@ -941,6 +960,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase): growing_metadata { num_trees_attempted: 1 num_layers_attempted: 1 + last_layer_node_start: 1 + last_layer_node_end: 3 } """ self.assertEqual(new_stamp, 1) @@ -1046,6 +1067,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase): growing_metadata { num_trees_attempted: 1 num_layers_attempted: 2 + last_layer_node_start: 3 + last_layer_node_end: 7 } """ self.assertEqual(new_stamp, 2) @@ -1179,6 +1202,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase): growing_metadata { num_trees_attempted: 1 num_layers_attempted: 3 + last_layer_node_start: 0 + last_layer_node_end: 1 } """ self.assertEqual(new_stamp, 3) @@ -1268,6 +1293,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase): growing_metadata { num_trees_attempted: 1 num_layers_attempted: 1 + last_layer_node_start: 1 + last_layer_node_end: 3 } """ self.assertEqual(new_stamp, 1) @@ -1307,7 +1334,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase): # Expect the ensemble to be empty as post-pruning will prune # the entire finalized tree. self.assertEqual(new_stamp, 2) - self.assertProtoEquals(""" + self.assertProtoEquals( + """ trees { nodes { leaf { @@ -1359,6 +1387,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase): growing_metadata { num_trees_attempted: 1 num_layers_attempted: 2 + last_layer_node_start: 0 + last_layer_node_end: 1 } """, res_ensemble) @@ -1455,6 +1485,8 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase): growing_metadata { num_trees_attempted: 1 num_layers_attempted: 1 + last_layer_node_start: 0 + last_layer_node_end: 1 } """ self.assertEqual(new_stamp, 1) diff --git a/tensorflow/python/ops/boosted_trees_ops.py b/tensorflow/python/ops/boosted_trees_ops.py index 174d00987f9..2a2bcdd9d69 100644 --- a/tensorflow/python/ops/boosted_trees_ops.py +++ b/tensorflow/python/ops/boosted_trees_ops.py @@ -115,7 +115,7 @@ class TreeEnsemble(object): def get_stamp_token(self): """Returns the current stamp token of the resource.""" - stamp_token, _, _, _ = ( + stamp_token, _, _, _, _ = ( gen_boosted_trees_ops.boosted_trees_get_ensemble_states( self.resource_handle)) return stamp_token @@ -124,17 +124,20 @@ class TreeEnsemble(object): """Returns states of the tree ensemble. 
Returns: - stamp_token, num_trees, num_finalized_trees, num_attempted_layers. + stamp_token, num_trees, num_finalized_trees, num_attempted_layers and + range of the nodes in the latest layer. """ - stamp_token, num_trees, num_finalized_trees, num_attempted_layers = ( - gen_boosted_trees_ops.boosted_trees_get_ensemble_states( - self.resource_handle)) + (stamp_token, num_trees, num_finalized_trees, num_attempted_layers, + nodes_range) = ( + gen_boosted_trees_ops.boosted_trees_get_ensemble_states( + self.resource_handle)) # Use identity to give names. return (array_ops.identity(stamp_token, name='stamp_token'), array_ops.identity(num_trees, name='num_trees'), array_ops.identity(num_finalized_trees, name='num_finalized_trees'), array_ops.identity( - num_attempted_layers, name='num_attempted_layers')) + num_attempted_layers, name='num_attempted_layers'), + array_ops.identity(nodes_range, name='last_layer_nodes_range')) def serialize(self): """Serializes the ensemble into proto and returns the serialized proto. From 33c737b70d42e05cabc43b4c6e778e988b6d0a9e Mon Sep 17 00:00:00 2001 From: Younghee Kwon Date: Wed, 11 Apr 2018 16:59:45 -0700 Subject: [PATCH 0101/1734] boosted_trees: make sure ensemble deserialization happens for the non-TRAIN modes too. PiperOrigin-RevId: 192532297 --- .../python/estimator/canned/boosted_trees.py | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py index 58af59dbb17..0ecc8c7089a 100644 --- a/tensorflow/python/estimator/canned/boosted_trees.py +++ b/tensorflow/python/estimator/canned/boosted_trees.py @@ -317,27 +317,28 @@ def _bt_model_fn( head.logits_dimension) # Create Ensemble resources. - if is_single_machine: - tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name) - local_tree_ensemble = tree_ensemble - ensemble_reload = control_flow_ops.no_op() - else: - tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name) - with ops.device(worker_device): - local_tree_ensemble = boosted_trees_ops.TreeEnsemble( - name=name + '_local', is_local=True) - # TODO(soroush): Do partial updates if this becomes a bottleneck. - ensemble_reload = local_tree_ensemble.deserialize( - *tree_ensemble.serialize()) - + tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name) # Create logits. if mode != model_fn.ModeKeys.TRAIN: logits = boosted_trees_ops.predict( - tree_ensemble_handle=local_tree_ensemble.resource_handle, + # For non-TRAIN mode, ensemble doesn't change after initialization, + # so no local copy is needed; using tree_ensemble directly. + tree_ensemble_handle=tree_ensemble.resource_handle, bucketized_features=input_feature_list, logits_dimension=head.logits_dimension, max_depth=tree_hparams.max_depth) else: + if is_single_machine: + local_tree_ensemble = tree_ensemble + ensemble_reload = control_flow_ops.no_op() + else: + # Have a local copy of ensemble for the distributed setting. + with ops.device(worker_device): + local_tree_ensemble = boosted_trees_ops.TreeEnsemble( + name=name + '_local', is_local=True) + # TODO(soroush): Do partial updates if this becomes a bottleneck. 
+ ensemble_reload = local_tree_ensemble.deserialize( + *tree_ensemble.serialize()) if cache: cached_tree_ids, cached_node_ids, cached_logits = cache.lookup() else: From fa6150d369ea40b795a17221e6f5a0bf054a8cc8 Mon Sep 17 00:00:00 2001 From: Sami Kama Date: Fri, 13 Apr 2018 15:01:07 -0700 Subject: [PATCH 0102/1734] Adding py_test for TF-TRT integration --- tensorflow/contrib/tensorrt/BUILD | 9 + .../contrib/tensorrt/test/test_integration.py | 178 ++++++++++++++++++ 2 files changed, 187 insertions(+) create mode 100644 tensorflow/contrib/tensorrt/test/test_integration.py diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD index fd3582e175e..d116114db06 100644 --- a/tensorflow/contrib/tensorrt/BUILD +++ b/tensorflow/contrib/tensorrt/BUILD @@ -272,3 +272,12 @@ tf_cc_test( "//tensorflow/core:test_main", ], ) + +py_test( + name = "tf_trt_integration_test", + srcs = ["test/test_integration.py"], + srcs_version = "PY2AND3", + deps = [ + ":init_py" + ] +) \ No newline at end of file diff --git a/tensorflow/contrib/tensorrt/test/test_integration.py b/tensorflow/contrib/tensorrt/test/test_integration.py new file mode 100644 index 00000000000..8ad26c3f693 --- /dev/null +++ b/tensorflow/contrib/tensorrt/test/test_integration.py @@ -0,0 +1,178 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Script to test TF-TensorRT integration.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib import tensorrt as trt +from tensorflow.core.protobuf import config_pb2 as cpb2 +from tensorflow.python.client import session as csess +from tensorflow.python.framework import test_util +from tensorflow.python.framework import constant_op as cop +from tensorflow.python.framework import dtypes as dtypes +from tensorflow.python.framework import importer as importer +from tensorflow.python.framework import ops as ops +from tensorflow.python.ops import array_ops as aops +from tensorflow.python.ops import nn as nn +from tensorflow.python.ops import nn_ops as nn_ops +from tensorflow.python.platform import googletest +from tensorflow.python.platform import test + + +@test_util.with_c_api +class IntegrationTest(test_util.TensofFlowTestCase): + + def setUp(self): + """ Setup method """ + super(IntegrationTest, self).setUp() + warnings.simplefilter('always') + inp_dims = (100, 24, 24, 2) + self._input = np.random.random_sample(inp_dims) + self._original_graph = get_simple_graph_def() + self._gpu_options = cpb2.GPUOptions( + per_process_gpu_memory_fraction=0.50) + self._config = cpb2.ConfigProto(gpu_options=gpu_options) + self._reference = self.run_graph(self._original_graph, self._input) + + def get_simple_graph_def(self): + """Create a simple graph and return its graph_def.""" + g = ops.Graph() + with g.as_default(): + a = aops.placeholder( + dtype=dtypes.float32, shape=(None, 24, 24, 2), name="input") + e = cop.constant( + [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]], + name="weights", + dtype=dtypes.float32) + conv = nn.conv2d( + input=a, + filter=e, + strides=[1, 2, 2, 1], + padding="SAME", + name="conv") + b = cop.constant( + [4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtypes.float32) + t = nn.bias_add(conv, b, name="biasAdd") + relu = nn.relu(t, "relu") + idty = aops.identity(relu, "ID") + v = nn_ops.max_pool( + idty, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool") + aops.squeeze(v, name="output") + return g.as_graph_def() + + def run_graph(self, gdef, dumm_inp): + """Run given graphdef once.""" + ops.reset_default_graph() + g = ops.Graph() + with g.as_default(): + inp, out = importer.import_graph_def( + graph_def=gdef, return_elements=["input", "output"]) + inp = inp.outputs[0] + out = out.outputs[0] + with self.test_session( + grap=g, config=self._config, use_gpu=True, + force_gpu=True) as sess: + val = sess.run(out, {inp: dumm_inp}) + return val + + # Use real data that is representative of the inference dataset + # for calibration. For this test script it is random data. + def run_calibration(self, gdef, dumm_inp): + """Run given calibration graph multiple times.""" + ops.reset_default_graph() + g = ops.Graph() + with g.as_default(): + inp, out = importer.import_graph_def( + graph_def=gdef, return_elements=["input", "output"]) + inp = inp.outputs[0] + out = out.outputs[0] + # run over real calibration data here, we are mimicking a calibration set of + # 30 different batches. 
Use as much calibration data as you want + with self.test_session( + grap=g, config=self._config, use_gpu=True, + force_gpu=True) as sess: + for _ in range(30): + val = sess.run(out, {inp: dumm_inp}) + return val + + def get_trt_graph(self, mode): + """ return trt converted graph """ + if mode == "FP32": + return trt.create_inference_graph( + input_graph_def=self._orig_graph, + outputs=["output"], + max_batch_size=inp_dims[0], + max_workspace_size_bytes=1 << 25, + precision_mode= + "FP32", # TRT Engine precision "FP32","FP16" or "INT8" + minimum_segment_size=2 # minimum number of nodes in an engine + ) + elif mode == "FP16": + return trt.create_inference_graph( + input_graph_def=self._orig_graph, + outputs=["output"], + max_batch_size=inp_dims[0], + max_workspace_size_bytes=1 << 25, + precision_mode= + "FP16", # TRT Engine precision "FP32","FP16" or "INT8" + minimum_segment_size=2 # minimum number of nodes in an engine + ) + elif mode == "INT8": + return trt.create_inference_graph( + input_graph_def=self._orig_graph, + outputs=["output"], + max_batch_size=inp_dims[0], + max_workspace_size_bytes=1 << 25, + precision_mode= + "INT8", # TRT Engine precision "FP32","FP16" or "INT8" + minimum_segment_size=2 # minimum number of nodes in an engine + ) + + return None + + def testFP32(self): + """ Test FP32 conversion. Results should be identical to native case """ + trt_graph = self.get_trt_graph("FP32") + result = self.run_graph(trt_graph, self._input) + self.assertAllEqual(self._reference, result) + result = self.run_graph(trt_graph, self._input) + self.assertAllEqual(self._reference, result) + + def testFP16(self): + """ Test FP16 conversion. Results may be different from native case """ + trt_graph = self.get_trt_graph("FP16") + result = self.run_graph(trt_graph, self._input) + self.assertAllEqual(self._reference, result) + result = self.run_graph(trt_graph, self._input) + self.assertAllEqual(self._reference, result) + + def testINT8(self): + """ Test INT8 conversion. 
Results may be different from native case """ + calib_graph = self.get_trt_graph("INT8") + result = self.run_calibration(calib_graph, self._input) + self.assertAllEqual(self._reference, result) + int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef) + result = self.run_graph(int8_graph, self._input) + self.assertAllEqual(self._reference, result) + result = self.run_graph(int8_graph, self._input) + self.assertAllEqual(self._reference, result) + + +if __name__ == '__main__': + googletest.main() From 9fb54c30efdcf38ef83c2709a8619a5bf20f2434 Mon Sep 17 00:00:00 2001 From: Sami Kama Date: Fri, 13 Apr 2018 15:18:48 -0700 Subject: [PATCH 0103/1734] Fix testing --- .../contrib/tensorrt/test/test_integration.py | 41 ++++++++++--------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/tensorflow/contrib/tensorrt/test/test_integration.py b/tensorflow/contrib/tensorrt/test/test_integration.py index 8ad26c3f693..97915c26590 100644 --- a/tensorflow/contrib/tensorrt/test/test_integration.py +++ b/tensorflow/contrib/tensorrt/test/test_integration.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function import numpy as np +import warnings from tensorflow.contrib import tensorrt as trt from tensorflow.core.protobuf import config_pb2 as cpb2 @@ -36,7 +37,7 @@ from tensorflow.python.platform import test @test_util.with_c_api -class IntegrationTest(test_util.TensofFlowTestCase): +class IntegrationTest(test_util.TensorFlowTestCase): def setUp(self): """ Setup method """ @@ -44,10 +45,10 @@ class IntegrationTest(test_util.TensofFlowTestCase): warnings.simplefilter('always') inp_dims = (100, 24, 24, 2) self._input = np.random.random_sample(inp_dims) - self._original_graph = get_simple_graph_def() + self._original_graph = self.get_simple_graph_def() self._gpu_options = cpb2.GPUOptions( per_process_gpu_memory_fraction=0.50) - self._config = cpb2.ConfigProto(gpu_options=gpu_options) + self._config = cpb2.ConfigProto(gpu_options=self._gpu_options) self._reference = self.run_graph(self._original_graph, self._input) def get_simple_graph_def(self): @@ -86,7 +87,7 @@ class IntegrationTest(test_util.TensofFlowTestCase): inp = inp.outputs[0] out = out.outputs[0] with self.test_session( - grap=g, config=self._config, use_gpu=True, + graph=g, config=self._config, use_gpu=True, force_gpu=True) as sess: val = sess.run(out, {inp: dumm_inp}) return val @@ -105,7 +106,7 @@ class IntegrationTest(test_util.TensofFlowTestCase): # run over real calibration data here, we are mimicking a calibration set of # 30 different batches. 
Use as much calibration data as you want with self.test_session( - grap=g, config=self._config, use_gpu=True, + graph=g, config=self._config, use_gpu=True, force_gpu=True) as sess: for _ in range(30): val = sess.run(out, {inp: dumm_inp}) @@ -115,9 +116,9 @@ class IntegrationTest(test_util.TensofFlowTestCase): """ return trt converted graph """ if mode == "FP32": return trt.create_inference_graph( - input_graph_def=self._orig_graph, + input_graph_def=self._original_graph, outputs=["output"], - max_batch_size=inp_dims[0], + max_batch_size=self._input.shape[0], max_workspace_size_bytes=1 << 25, precision_mode= "FP32", # TRT Engine precision "FP32","FP16" or "INT8" @@ -125,9 +126,9 @@ class IntegrationTest(test_util.TensofFlowTestCase): ) elif mode == "FP16": return trt.create_inference_graph( - input_graph_def=self._orig_graph, + input_graph_def=self._original_graph, outputs=["output"], - max_batch_size=inp_dims[0], + max_batch_size=self._input.shape[0], max_workspace_size_bytes=1 << 25, precision_mode= "FP16", # TRT Engine precision "FP32","FP16" or "INT8" @@ -135,9 +136,9 @@ class IntegrationTest(test_util.TensofFlowTestCase): ) elif mode == "INT8": return trt.create_inference_graph( - input_graph_def=self._orig_graph, + input_graph_def=self._original_graph, outputs=["output"], - max_batch_size=inp_dims[0], + max_batch_size=self._input.shape[0], max_workspace_size_bytes=1 << 25, precision_mode= "INT8", # TRT Engine precision "FP32","FP16" or "INT8" @@ -151,27 +152,27 @@ class IntegrationTest(test_util.TensofFlowTestCase): trt_graph = self.get_trt_graph("FP32") result = self.run_graph(trt_graph, self._input) self.assertAllEqual(self._reference, result) - result = self.run_graph(trt_graph, self._input) - self.assertAllEqual(self._reference, result) + result1 = self.run_graph(trt_graph, self._input) + self.assertAllEqual(result1, result) def testFP16(self): """ Test FP16 conversion. Results may be different from native case """ trt_graph = self.get_trt_graph("FP16") result = self.run_graph(trt_graph, self._input) - self.assertAllEqual(self._reference, result) - result = self.run_graph(trt_graph, self._input) - self.assertAllEqual(self._reference, result) + self.assertAllClose(self._reference, result,rtol=1.e-03) + result1 = self.run_graph(trt_graph, self._input) + self.assertAllEqual(result1, result) def testINT8(self): """ Test INT8 conversion. Results may be different from native case """ calib_graph = self.get_trt_graph("INT8") result = self.run_calibration(calib_graph, self._input) self.assertAllEqual(self._reference, result) - int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef) + int8_graph = trt.calib_graph_to_infer_graph(calib_graph) result = self.run_graph(int8_graph, self._input) - self.assertAllEqual(self._reference, result) - result = self.run_graph(int8_graph, self._input) - self.assertAllEqual(self._reference, result) + self.assertAllClose(self._reference, result,rtol=1.e-03) + result1 = self.run_graph(int8_graph, self._input) + self.assertAllEqual(result1, result) if __name__ == '__main__': From 6048b07adb364fcef086fb30ecdfb8a2881ba6ac Mon Sep 17 00:00:00 2001 From: Yu-Cheng Ling Date: Fri, 13 Apr 2018 17:13:45 -0700 Subject: [PATCH 0104/1734] TFLite: Copy output data from BufferHandle to CPU memory by default. 
PiperOrigin-RevId: 192846824
---
 tensorflow/contrib/lite/interpreter.cc |  6 ++++++
 tensorflow/contrib/lite/interpreter.h  | 16 ++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
index f2586546088..31b874a6a65 100644
--- a/tensorflow/contrib/lite/interpreter.cc
+++ b/tensorflow/contrib/lite/interpreter.cc
@@ -570,6 +570,12 @@ TfLiteStatus Interpreter::Invoke() {
     }
   }

+  if (!allow_buffer_handle_output_) {
+    for (int tensor_index : outputs_) {
+      EnsureTensorDataIsReadable(tensor_index);
+    }
+  }
+
   return status;
 }

diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index df67cce9de5..3c776aacb6b 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -282,6 +282,7 @@ class Interpreter {
   // Ensure the data in `tensor.data` is readable. In case delegate is used,
   // it might require to copy the data from delegate buffer to raw memory.
+  // WARNING: This is an experimental API and subject to change.
   TfLiteStatus EnsureTensorDataIsReadable(int tensor_index) {
     TF_LITE_ENSURE(&context_, tensor_index < tensors_size());
     TfLiteTensor* tensor = &tensors_[tensor_index];
@@ -328,6 +329,18 @@ class Interpreter {
   // pointers to existing tensors.
   static constexpr int kTensorsCapacityHeadroom = 16;

+  // Set if buffer handle output is allowed.
+  //
+  // When using hardware delegation, Interpreter will make the data of output
+  // tensors available in `tensor->data` by default. If the application can
+  // consume the buffer handle directly (e.g. reading output from OpenGL
+  // texture), it can set this flag to false, so Interpreter won't copy the data
+  // from buffer handle to CPU memory.
+  // WARNING: This is an experimental API and subject to change.
+  void SetAllowBufferHandleOutput(bool allow_buffer_handle_output) {
+    allow_buffer_handle_output_ = allow_buffer_handle_output;
+  }
+
  private:
   // Give 'op_reg' a chance to initialize itself using the contents of
   // 'buffer'.
@@ -518,6 +531,9 @@ class Interpreter {

   std::unique_ptr<NNAPIDelegate> nnapi_delegate_;

   std::unique_ptr<MemoryPlanner> memory_planner_;
+
+  // WARNING: This is an experimental interface that is subject to change.
+  bool allow_buffer_handle_output_ = false;
 };

 } // namespace tflite

From 360c5a37957311657d45c351248aaa8e8fcac3be Mon Sep 17 00:00:00 2001
From: James Qin
Date: Fri, 13 Apr 2018 17:26:46 -0700
Subject: [PATCH 0105/1734] Revamp Cudnn RNN kernels for incoming autotune
 changes.

* Create DoForward() and DoBackward() to be used by fwd/bak kernels and later
autotune.
* Simplify CudnnRnnForward Compute() function. Offload the majority of its
logic to other member functions.
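
For orientation, these kernels sit underneath the contrib Cudnn RNN layers, so the DoForward()/DoBackward() split introduced below is exercised from Python roughly as sketched here. This is a hedged sketch assuming the tf.contrib.cudnn_rnn layer API of this period and a CUDA-enabled build; it is not part of the patch:

    import tensorflow as tf

    # Time-major input: [seq_length, batch_size, input_size].
    inputs = tf.random_normal([10, 4, 8])
    lstm = tf.contrib.cudnn_rnn.CudnnLSTM(num_layers=2, num_units=16)
    # The forward pass runs the CudnnRNN kernel (DoForward below); taking
    # gradients runs the backward kernel (DoBackward below).
    outputs, (output_h, output_c) = lstm(inputs)
    loss = tf.reduce_sum(outputs)
    grads = tf.gradients(loss, tf.trainable_variables())
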
PiperOrigin-RevId: 192848100 --- tensorflow/core/kernels/cudnn_rnn_ops.cc | 703 ++++++++++++++--------- 1 file changed, 417 insertions(+), 286 deletions(-) diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc index e4036ddaa9b..a21f13a4ddc 100644 --- a/tensorflow/core/kernels/cudnn_rnn_ops.cc +++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc @@ -78,6 +78,7 @@ using CPUDevice = Eigen::ThreadPoolDevice; #if GOOGLE_CUDA using GPUDevice = Eigen::GpuDevice; +using ::perftools::gputools::StreamExecutor; template class CudnnRNNParamsSizeOp; @@ -101,15 +102,21 @@ enum class TFRNNInputMode { }; namespace { -using perftools::gputools::DeviceMemory; -using perftools::gputools::DeviceMemoryBase; -using perftools::gputools::ScratchAllocator; -using perftools::gputools::dnn::AlgorithmConfig; -using perftools::gputools::dnn::RnnDirectionMode; -using perftools::gputools::dnn::RnnInputMode; -using perftools::gputools::dnn::RnnMode; -using perftools::gputools::dnn::ToDataType; -using perftools::gputools::port::StatusOr; +using ::perftools::gputools::DeviceMemory; +using ::perftools::gputools::DeviceMemoryBase; +using ::perftools::gputools::ScratchAllocator; +using ::perftools::gputools::Stream; +using ::perftools::gputools::dnn::AlgorithmConfig; +using ::perftools::gputools::dnn::AlgorithmDesc; +using ::perftools::gputools::dnn::ProfileResult; +using ::perftools::gputools::dnn::RnnDescriptor; +using ::perftools::gputools::dnn::RnnDirectionMode; +using ::perftools::gputools::dnn::RnnInputMode; +using ::perftools::gputools::dnn::RnnMode; +using ::perftools::gputools::dnn::RnnSequenceTensorDescriptor; +using ::perftools::gputools::dnn::RnnStateTensorDescriptor; +using ::perftools::gputools::dnn::ToDataType; +using ::perftools::gputools::port::StatusOr; Status ParseRNNMode(const string& str, RnnMode* rnn_mode) { if (str == "rnn_relu") { @@ -252,12 +259,12 @@ class CudnnRnnAllocatorInTemp : public ScratchAllocator { explicit CudnnRnnAllocatorInTemp(OpKernelContext* context) : context_(context) {} - int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override { + int64 GetMemoryLimitInBytes(Stream* stream) override { return std::numeric_limits::max(); } - StatusOr> AllocateBytes( - perftools::gputools::Stream* stream, int64 byte_size) override { + StatusOr> AllocateBytes(Stream* stream, + int64 byte_size) override { Tensor temporary_memory; const DataType tf_data_type = ToTFDataType::value; int64 allocate_count = @@ -298,11 +305,11 @@ class CudnnRnnAllocatorInOutput : public ScratchAllocator { ~CudnnRnnAllocatorInOutput() override {} CudnnRnnAllocatorInOutput(OpKernelContext* context, int output_index) : context_(context), output_index_(output_index) {} - int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override { + int64 GetMemoryLimitInBytes(Stream* stream) override { return std::numeric_limits::max(); } - StatusOr> AllocateBytes( - perftools::gputools::Stream* stream, int64 byte_size) override { + StatusOr> AllocateBytes(Stream* stream, + int64 byte_size) override { CHECK(total_byte_size_ == 0) << "Reserve space allocator can only be called once"; int64 allocate_count = @@ -338,12 +345,12 @@ class CudnnRNNPersistentSpaceAllocator : public ScratchAllocator { ~CudnnRNNPersistentSpaceAllocator() override {} - int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override { + int64 GetMemoryLimitInBytes(Stream* stream) override { return std::numeric_limits::max(); } - StatusOr> AllocateBytes( - perftools::gputools::Stream* stream, 
int64 byte_size) override { + StatusOr> AllocateBytes(Stream* stream, + int64 byte_size) override { if (total_byte_size_ != 0) { return Status(error::FAILED_PRECONDITION, "Persistent space allocator can only be called once"); @@ -374,6 +381,13 @@ struct CudnnModelTypes { // input-h. return rnn_mode == RnnMode::kRnnLstm; } + + string DebugString() const { + return strings::Printf( + "[rnn_mode, rnn_input_mode, rnn_direction_mode]: %d, %d, %d ", + static_cast(rnn_mode), static_cast(rnn_input_mode), + static_cast(rnn_direction_mode)); + } }; // A helper class that collects the shapes to describe a RNN model. @@ -381,9 +395,9 @@ struct CudnnRnnModelShapes { int num_layers; int input_size; int num_units; + int dir_count; int seq_length; int batch_size; - int dir_count; TensorShape input_shape; TensorShape output_shape; TensorShape hidden_state_shape; @@ -392,10 +406,11 @@ struct CudnnRnnModelShapes { return num_layers == rhs.num_layers && input_size == rhs.input_size && num_units == rhs.num_units && dir_count == rhs.dir_count; } - string RnnDescDebugString() { + string DebugString() const { return strings::Printf( - "[num_layers, input_size, num_units, dir_count]: [%d, %d, %d, %d]", - num_layers, input_size, num_units, dir_count); + "[num_layers, input_size, num_units, dir_count, seq_length, " + "batch_size]: [%d, %d, %d, %d, %d, %d] ", + num_layers, input_size, num_units, dir_count, seq_length, batch_size); } }; @@ -420,8 +435,15 @@ struct CudnnRnnModelShapesComparator { } }; -// Extract and checks the forward input tensors, parameters, and shapes from -// the OpKernelContext. +// Pointers to RNN scratch space for a specific set of shape parameters (used as +// a hash table value in CudnnRNNForwardOp and CudnnRNNBackwardOp). +struct RnnScratchSpace { + std::unique_ptr rnn_desc; + std::unique_ptr dropout_state_allocator; +}; + +// Extract and checks the forward input tensors, parameters, and shapes from the +// OpKernelContext. 
Status ExtractForwardInput(OpKernelContext* context, const CudnnModelTypes& model_types, const Tensor** input, const Tensor** input_h, @@ -474,13 +496,171 @@ Status ExtractForwardInput(OpKernelContext* context, return Status::OK(); } -using perftools::gputools::dnn::RnnDescriptor; +template +Status CreateForwardAndBackwardIODescriptors( + OpKernelContext* context, const CudnnRnnModelShapes& model_shapes, + std::unique_ptr* input_desc, + std::unique_ptr* state_desc, + std::unique_ptr* output_desc) { + StreamExecutor* executor = context->op_device_context()->stream()->parent(); + ::perftools::gputools::dnn::DataType data_type = ToDataType::value; + + const TensorShape& input_shape = model_shapes.input_shape; + const TensorShape& hidden_state_shape = model_shapes.hidden_state_shape; + const TensorShape& output_shape = model_shapes.output_shape; + + DCHECK_EQ(input_shape.dims(), 3); + auto input_desc_s = executor->createRnnSequenceTensorDescriptor( + input_shape.dim_size(0), input_shape.dim_size(1), input_shape.dim_size(2), + data_type); + TF_RETURN_IF_ERROR(input_desc_s.status()); + *input_desc = input_desc_s.ConsumeValueOrDie(); + + DCHECK_EQ(hidden_state_shape.dims(), 3); + auto hidden_state_desc_s = executor->createRnnStateTensorDescriptor( + hidden_state_shape.dim_size(0), hidden_state_shape.dim_size(1), + hidden_state_shape.dim_size(2), data_type); + TF_RETURN_IF_ERROR(hidden_state_desc_s.status()); + *state_desc = hidden_state_desc_s.ConsumeValueOrDie(); + + DCHECK_EQ(output_shape.dims(), 3); + auto output_desc_s = executor->createRnnSequenceTensorDescriptor( + output_shape.dim_size(0), output_shape.dim_size(1), + output_shape.dim_size(2), data_type); + TF_RETURN_IF_ERROR(output_desc_s.status()); + *output_desc = output_desc_s.ConsumeValueOrDie(); + return Status::OK(); +} + +template +Status DoForward(OpKernelContext* context, const RnnDescriptor& rnn_desc, + const CudnnModelTypes& model_types, + const CudnnRnnModelShapes& model_shapes, + /* forward inputs */ + const Tensor* input, const Tensor* input_h, + const Tensor* input_c, const Tensor* params, + const bool is_training, + /* forward outputs, outputs of the function */ + Tensor* output, Tensor* output_h, Tensor* output_c, + ScratchAllocator* reserve_space_allocator, + ScratchAllocator* workspace_allocator, + ProfileResult* output_profile_result) { + std::unique_ptr input_desc; + std::unique_ptr state_desc; + std::unique_ptr output_desc; + + TF_RETURN_IF_ERROR(CreateForwardAndBackwardIODescriptors( + context, model_shapes, &input_desc, &state_desc, &output_desc)); + + auto input_data = AsDeviceMemory(input); + auto input_h_data = AsDeviceMemory(input_h); + DeviceMemory input_c_data; + if (model_types.HasInputC()) { + input_c_data = AsDeviceMemory(input_c); + } + auto params_data = AsDeviceMemory(params); + auto output_data = AsDeviceMemory(output); + auto output_h_data = AsDeviceMemory(output_h); + DeviceMemory output_c_data; + if (model_types.HasInputC()) { + output_c_data = AsDeviceMemory(output_c); + } + + Stream* stream = context->op_device_context()->stream(); + bool launch_success = + stream + ->ThenRnnForward(rnn_desc, *input_desc, input_data, *state_desc, + input_h_data, *state_desc, input_c_data, params_data, + *output_desc, &output_data, *state_desc, + &output_h_data, *state_desc, &output_c_data, + is_training, reserve_space_allocator, + workspace_allocator, output_profile_result) + .ok(); + return launch_success + ? 
Status::OK() + : errors::Internal( + "Failed to call ThenRnnForward with model config: ", + model_types.DebugString(), ", ", model_shapes.DebugString()); +} + +template +Status DoBackward( + OpKernelContext* context, const RnnDescriptor& rnn_desc, + const CudnnModelTypes& model_types, const CudnnRnnModelShapes& model_shapes, + /* forward inputs */ + const Tensor* input, const Tensor* input_h, const Tensor* input_c, + const Tensor* params, + /* forward outptus */ + const Tensor* output, const Tensor* output_h, const Tensor* output_c, + /* backprop inputs */ + const Tensor* output_backprop, const Tensor* output_h_backprop, + const Tensor* output_c_backprop, const Tensor* reserve_space, + /* backprop outputs, output of the function */ + Tensor* input_backprop, Tensor* input_h_backprop, Tensor* input_c_backprop, + Tensor* params_backprop, ScratchAllocator* workspace_allocator, + ProfileResult* output_profile_result) { + std::unique_ptr input_desc; + std::unique_ptr state_desc; + std::unique_ptr output_desc; + + TF_RETURN_IF_ERROR(CreateForwardAndBackwardIODescriptors( + context, model_shapes, &input_desc, &state_desc, &output_desc)); + + auto input_data = AsDeviceMemory(input); + auto input_h_data = AsDeviceMemory(input_h); + DeviceMemory input_c_data; + if (model_types.HasInputC()) { + input_c_data = AsDeviceMemory(input_c); + } + auto params_data = AsDeviceMemory(params); + auto output_data = AsDeviceMemory(output); + auto output_h_data = AsDeviceMemory(output_h); + DeviceMemory output_c_data; + if (model_types.HasInputC()) { + output_c_data = AsDeviceMemory(output_c); + } + auto output_backprop_data = AsDeviceMemory(output_backprop); + auto output_h_backprop_data = AsDeviceMemory(output_h_backprop); + DeviceMemory output_c_backprop_data; + if (model_types.HasInputC()) { + output_c_backprop_data = AsDeviceMemory(output_c_backprop); + } + auto input_backprop_data = AsDeviceMemory(input_backprop); + auto input_h_backprop_data = AsDeviceMemory(input_h_backprop); + DeviceMemory input_c_backprop_data; + if (model_types.HasInputC()) { + input_c_backprop_data = AsDeviceMemory(input_c_backprop); + } + auto params_backprop_data = AsDeviceMemory(params_backprop); + auto reserve_space_uint8 = + CastDeviceMemory(const_cast(reserve_space)); + + // Creates a memory callback for the workspace. The memory lives to the end + // of this kernel calls. + Stream* stream = context->op_device_context()->stream(); + bool launch_success = + stream + ->ThenRnnBackward(rnn_desc, *input_desc, input_data, *state_desc, + input_h_data, *state_desc, input_c_data, + params_data, *output_desc, output_data, *state_desc, + output_h_data, *state_desc, output_c_data, + output_backprop_data, output_h_backprop_data, + output_c_backprop_data, &input_backprop_data, + &input_h_backprop_data, &input_c_backprop_data, + ¶ms_backprop_data, &reserve_space_uint8, + workspace_allocator, output_profile_result) + .ok(); + return launch_success + ? Status::OK() + : errors::Internal( + "Failed to call ThenRnnBackward with model config: ", + model_types.DebugString(), ", ", model_shapes.DebugString()); +} template void RestoreParams(const OpInputList params_input, const std::vector& params, - DeviceMemoryBase* data_dst, - perftools::gputools::Stream* stream) { + DeviceMemoryBase* data_dst, Stream* stream) { int num_params = params.size(); CHECK(params_input.size() == num_params) << "Number of params mismatch. 
Expected " << params_input.size() @@ -570,7 +750,7 @@ class CudnnRNNKernelCommon : public OpKernel { TF_RETURN_IF_ERROR( ToRNNInputMode(rnn_input_mode(), num_units, input_size, &input_mode)); - auto* stream = context->op_device_context()->stream(); + Stream* stream = context->op_device_context()->stream(); // ExtracCudnnRNNParamsInfo is only called by op_kernels that do not require // random number generator, therefore set state_allocator to nullptr. const AlgorithmConfig algo_config; @@ -585,6 +765,51 @@ class CudnnRNNKernelCommon : public OpKernel { return Status::OK(); } + template + Status CreateRnnDescriptor(OpKernelContext* context, + const CudnnRnnModelShapes& model_shapes, + const RnnInputMode& input_mode, + const AlgorithmConfig& algo_config, + ScratchAllocator* dropout_state_allocator, + std::unique_ptr* rnn_desc) { + StreamExecutor* executor = context->op_device_context()->stream()->parent(); + ::perftools::gputools::dnn::DataType data_type = ToDataType::value; + auto rnn_desc_s = executor->createRnnDescriptor( + model_shapes.num_layers, model_shapes.num_units, + model_shapes.input_size, input_mode, rnn_direction_mode(), rnn_mode(), + data_type, algo_config, dropout(), seed(), dropout_state_allocator); + TF_RETURN_IF_ERROR(rnn_desc_s.status()); + + *rnn_desc = rnn_desc_s.ConsumeValueOrDie(); + return Status::OK(); + } + + using RnnStateCache = + gtl::FlatMap; + // Returns a raw rnn descriptor pointer. The cache owns the rnn descriptor and + // should outlive the returned pointer. + template + Status GetCachedRnnDescriptor(OpKernelContext* context, + const CudnnRnnModelShapes& model_shapes, + const RnnInputMode& input_mode, + const AlgorithmConfig& algo_config, + RnnStateCache* cache, + RnnDescriptor** rnn_desc) { + RnnScratchSpace& rnn_state = (*cache)[model_shapes]; + if (rnn_state.rnn_desc == nullptr || ResetRndGenState()) { + CudnnRNNPersistentSpaceAllocator* dropout_state_allocator = + new CudnnRNNPersistentSpaceAllocator(context); + rnn_state.dropout_state_allocator.reset(dropout_state_allocator); + Status status = + CreateRnnDescriptor(context, model_shapes, input_mode, algo_config, + dropout_state_allocator, &rnn_state.rnn_desc); + TF_RETURN_IF_ERROR(status); + } + *rnn_desc = rnn_state.rnn_desc.get(); + return Status::OK(); + } + private: int seed_; int seed2_; @@ -648,7 +873,7 @@ class CudnnRNNParamsToCanonical : public CudnnRNNKernelCommon { void Compute(OpKernelContext* context) override { const Tensor& input = context->input(3); auto input_ptr = StreamExecutorUtil::AsDeviceMemory(input); - auto* stream = context->op_device_context()->stream(); + Stream* stream = context->op_device_context()->stream(); std::unique_ptr rnn_desc; OP_REQUIRES_OK(context, ExtractCudnnRNNParamsInfo(context, &rnn_desc)); @@ -789,7 +1014,7 @@ class CudnnRNNCanonicalToParams : public CudnnRNNKernelCommon { OP_REQUIRES_OK(context, context->allocate_output(0, {params_size}, &output)); auto output_ptr = StreamExecutorUtil::AsDeviceMemory(*output); - auto* stream = context->op_device_context()->stream(); + Stream* stream = context->op_device_context()->stream(); OpInputList weights; OP_REQUIRES_OK(context, context->input_list("weights", &weights)); @@ -816,13 +1041,6 @@ TF_CALL_float(REGISTER_GPU); TF_CALL_double(REGISTER_GPU); #undef REGISTER_GPU -// Pointers to RNN scratch space for a specific set of shape parameters (used as -// a hash table value in CudnnRNNForwardOp and CudnnRNNBackwardOp). 
-struct RnnScratchSpace { - std::unique_ptr rnn_desc; - std::unique_ptr dropout_state_allocator; -}; - // Run the forward operation of the RNN model. template class CudnnRNNForwardOp : public CudnnRNNKernelCommon { @@ -842,115 +1060,71 @@ class CudnnRNNForwardOp : public CudnnRNNKernelCommon { OP_REQUIRES_OK(context, ExtractForwardInput(context, model_types(), &input, &input_h, &input_c, ¶ms, &model_shapes)); - const auto& input_shape = model_shapes.input_shape; - const auto& hidden_state_shape = model_shapes.hidden_state_shape; - const auto& output_shape = model_shapes.output_shape; - - Tensor* output = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); - Tensor* output_h = nullptr; - OP_REQUIRES_OK(context, - context->allocate_output(1, hidden_state_shape, &output_h)); - Tensor* output_c = nullptr; - if (HasInputC()) { - // Only LSTM uses input_c and output_c. So for all other models, we only - // need to create dummy outputs. - OP_REQUIRES_OK( - context, context->allocate_output(2, hidden_state_shape, &output_c)); - } else { - OP_REQUIRES_OK(context, context->allocate_output(2, {}, &output_c)); - } - - auto* stream = context->op_device_context()->stream(); - auto* executor = stream->parent(); RnnInputMode input_mode; OP_REQUIRES_OK(context, ToRNNInputMode(rnn_input_mode(), model_shapes.num_units, model_shapes.input_size, &input_mode)); - auto data_type = ToDataType::value; - auto input_desc_s = executor->createRnnSequenceTensorDescriptor( - input_shape.dim_size(0), input_shape.dim_size(1), - input_shape.dim_size(2), data_type); - OP_REQUIRES_OK(context, FromExecutorStatus(input_desc_s)); - auto input_desc = input_desc_s.ConsumeValueOrDie(); - - auto hidden_state_desc_s = executor->createRnnStateTensorDescriptor( - hidden_state_shape.dim_size(0), hidden_state_shape.dim_size(1), - hidden_state_shape.dim_size(2), data_type); - OP_REQUIRES_OK(context, FromExecutorStatus(hidden_state_desc_s)); - auto hidden_state_desc = hidden_state_desc_s.ConsumeValueOrDie(); - - auto output_desc_s = executor->createRnnSequenceTensorDescriptor( - output_shape.dim_size(0), output_shape.dim_size(1), - output_shape.dim_size(2), data_type); - OP_REQUIRES_OK(context, FromExecutorStatus(output_desc_s)); - auto output_desc = output_desc_s.ConsumeValueOrDie(); - - auto input_data = AsDeviceMemory(input); - auto input_h_data = AsDeviceMemory(input_h); - DeviceMemory input_c_data; - if (HasInputC()) { - input_c_data = AsDeviceMemory(input_c); - } - auto params_data = AsDeviceMemory(params); - auto output_data = AsDeviceMemory(output); - auto output_h_data = AsDeviceMemory(output_h); - DeviceMemory output_c_data; - if (HasInputC()) { - output_c_data = AsDeviceMemory(output_c); - } + Tensor* output = nullptr; + Tensor* output_h = nullptr; + Tensor* output_c = nullptr; + OP_REQUIRES_OK(context, AllocateOutputs(context, model_shapes, &output, + &output_h, &output_c)); + AlgorithmConfig algo_config; // Creates a memory callback for the reserve_space. The memory lives in the // output of this kernel. And it will be fed into the backward pass when // needed. CudnnRnnAllocatorInOutput reserve_space_allocator(context, 3); - if (!is_training_) { - Tensor* dummy_reserve_space = nullptr; - OP_REQUIRES_OK(context, - context->allocate_output(3, {}, &dummy_reserve_space)); - } // Creates a memory callback for the workspace. The memory lives to the end // of this kernel calls. 
CudnnRnnAllocatorInTemp workspace_allocator(context); - bool launch_status = false; + Status launch_status; { mutex_lock l(mu_); - RnnScratchSpace& rnn_state = rnn_state_cache_[model_shapes]; - if (rnn_state.rnn_desc == nullptr || ResetRndGenState()) { - CudnnRNNPersistentSpaceAllocator* dropout_state_allocator = - new CudnnRNNPersistentSpaceAllocator(context); - rnn_state.dropout_state_allocator.reset(dropout_state_allocator); - const AlgorithmConfig algo_config; - auto rnn_desc_s = executor->createRnnDescriptor( - model_shapes.num_layers, model_shapes.num_units, - model_shapes.input_size, input_mode, rnn_direction_mode(), - rnn_mode(), data_type, algo_config, dropout(), seed(), - dropout_state_allocator); - OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s)); - rnn_state.rnn_desc = std::move(rnn_desc_s.ConsumeValueOrDie()); - } - launch_status = - stream - ->ThenRnnForward( - *rnn_state.rnn_desc, *input_desc, input_data, - *hidden_state_desc, input_h_data, *hidden_state_desc, - input_c_data, params_data, *output_desc, &output_data, - *hidden_state_desc, &output_h_data, *hidden_state_desc, - &output_c_data, is_training_, &reserve_space_allocator, - &workspace_allocator, /*output_result_profile=*/nullptr) - .ok(); + RnnDescriptor* rnn_desc_ptr = nullptr; + OP_REQUIRES_OK( + context, GetCachedRnnDescriptor(context, model_shapes, input_mode, + algo_config, &rnn_state_cache_, + &rnn_desc_ptr)); + launch_status = DoForward( + context, *rnn_desc_ptr, model_types(), model_shapes, input, input_h, + input_c, params, is_training_, output, output_h, output_c, + &reserve_space_allocator, &workspace_allocator, + /*output_profile_result=*/nullptr); } - OP_REQUIRES(context, launch_status, - errors::Internal("Failed to call ThenRnnForward")); + OP_REQUIRES_OK(context, launch_status); } private: + Status AllocateOutputs(OpKernelContext* context, + const CudnnRnnModelShapes& model_shapes, + Tensor** output, Tensor** output_h, + Tensor** output_c) { + const TensorShape& hidden_state_shape = model_shapes.hidden_state_shape; + const TensorShape& output_shape = model_shapes.output_shape; + + TF_RETURN_IF_ERROR(context->allocate_output(0, output_shape, output)); + TF_RETURN_IF_ERROR( + context->allocate_output(1, hidden_state_shape, output_h)); + if (HasInputC()) { + TF_RETURN_IF_ERROR( + context->allocate_output(2, hidden_state_shape, output_c)); + } else { + // Only LSTM uses input_c and output_c. So for all other models, we only + // need to create dummy outputs. 
+ TF_RETURN_IF_ERROR(context->allocate_output(2, {}, output_c)); + } + if (!is_training_) { + Tensor* dummy_reserve_space = nullptr; + TF_RETURN_IF_ERROR(context->allocate_output(3, {}, &dummy_reserve_space)); + } + return Status::OK(); + } + mutex mu_; bool is_training_; - std::unordered_map - rnn_state_cache_ GUARDED_BY(mu_); + RnnStateCache rnn_state_cache_ GUARDED_BY(mu_); }; #define REGISTER_GPU(T) \ @@ -981,184 +1155,141 @@ class CudnnRNNBackwardOp : public CudnnRNNKernelCommon { OP_REQUIRES_OK(context, ExtractForwardInput(context, model_types(), &input, &input_h, &input_c, ¶ms, &model_shapes)); - - const auto& input_shape = model_shapes.input_shape; - const auto& hidden_state_shape = model_shapes.hidden_state_shape; - const auto& output_shape = model_shapes.output_shape; - - auto data_type = ToDataType::value; - const Tensor* output = nullptr; - OP_REQUIRES_OK(context, context->input("output", &output)); - OP_REQUIRES(context, output_shape == output->shape(), - errors::InvalidArgument( - "input_h and input_c must have the same shape: ", - input_h->shape().DebugString(), " ", - input_c->shape().DebugString())); - const Tensor* output_h = nullptr; - OP_REQUIRES_OK(context, context->input("output_h", &output_h)); - OP_REQUIRES(context, output_h->shape() == hidden_state_shape, - errors::InvalidArgument( - "Invalid output_h shape: ", output_h->shape().DebugString(), - " ", hidden_state_shape.DebugString())); - const Tensor* output_c = nullptr; - if (HasInputC()) { - // Only LSTM uses input_c and output_c. So for all other models, we only - // need to create dummy outputs. - OP_REQUIRES_OK(context, context->input("output_c", &output_c)); - OP_REQUIRES(context, output_c->shape() == hidden_state_shape, - errors::InvalidArgument("Invalid output_c shape: ", - output_c->shape().DebugString(), " ", - hidden_state_shape.DebugString())); - } - - const Tensor* output_backprop = nullptr; - OP_REQUIRES_OK(context, - context->input("output_backprop", &output_backprop)); - OP_REQUIRES(context, output_backprop->shape() == output_shape, - errors::InvalidArgument("Invalid output_backprop shapes: ", - output_backprop->shape().DebugString(), - " ", output_shape.DebugString())); - - const Tensor* output_h_backprop = nullptr; - OP_REQUIRES_OK(context, - context->input("output_h_backprop", &output_h_backprop)); - OP_REQUIRES( - context, output_h_backprop->shape() == hidden_state_shape, - errors::InvalidArgument("Invalid output_h_backprop shapes: ", - output_h_backprop->shape().DebugString(), " ", - hidden_state_shape.DebugString())); - const Tensor* output_c_backprop = nullptr; - if (HasInputC()) { - OP_REQUIRES_OK(context, - context->input("output_c_backprop", &output_c_backprop)); - OP_REQUIRES( - context, output_c_backprop->shape() == hidden_state_shape, - errors::InvalidArgument("Invalid output_c_backprop shapes: ", - output_c_backprop->shape().DebugString(), " ", - hidden_state_shape.DebugString())); - } - const Tensor* reserve_space_const = nullptr; - // This is the same "reserve_space" created by the forward op. - // It can also be modified by this backward operation. - OP_REQUIRES_OK(context, - context->input("reserve_space", &reserve_space_const)); - // Cudnn needs the reserve space to be writeable. This is fine because they - // are opaque. 
- Tensor* reserve_space = const_cast(reserve_space_const); - - Tensor* input_backprop = nullptr; - OP_REQUIRES_OK( - context, context->allocate_output(0, input->shape(), &input_backprop)); - Tensor* input_h_backprop = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(1, input_h->shape(), - &input_h_backprop)); - Tensor* input_c_backprop = nullptr; - if (HasInputC()) { - OP_REQUIRES_OK(context, context->allocate_output(2, input_c->shape(), - &input_c_backprop)); - } else { - OP_REQUIRES_OK(context, - context->allocate_output(2, {}, &input_c_backprop)); - } - Tensor* params_backprop = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(3, params->shape(), - ¶ms_backprop)); - - auto* stream = context->op_device_context()->stream(); - auto* executor = stream->parent(); RnnInputMode input_mode; OP_REQUIRES_OK(context, ToRNNInputMode(rnn_input_mode(), model_shapes.num_units, model_shapes.input_size, &input_mode)); - auto input_desc_s = executor->createRnnSequenceTensorDescriptor( - input_shape.dim_size(0), input_shape.dim_size(1), - input_shape.dim_size(2), data_type); - OP_REQUIRES_OK(context, FromExecutorStatus(input_desc_s)); - auto input_desc = input_desc_s.ConsumeValueOrDie(); + const Tensor* output = nullptr; + const Tensor* output_h = nullptr; + const Tensor* output_c = nullptr; + const Tensor* output_backprop = nullptr; + const Tensor* output_h_backprop = nullptr; + const Tensor* output_c_backprop = nullptr; + const Tensor* reserve_space = nullptr; + OP_REQUIRES_OK(context, + ExtractBackwardInputs(context, model_shapes, model_types(), + &output, &output_h, &output_c, + &output_backprop, &output_h_backprop, + &output_c_backprop, &reserve_space)); - auto hidden_state_desc_s = executor->createRnnStateTensorDescriptor( - hidden_state_shape.dim_size(0), hidden_state_shape.dim_size(1), - hidden_state_shape.dim_size(2), data_type); - OP_REQUIRES_OK(context, FromExecutorStatus(hidden_state_desc_s)); - auto hidden_state_desc = hidden_state_desc_s.ConsumeValueOrDie(); + Tensor* input_backprop = nullptr; + Tensor* input_h_backprop = nullptr; + Tensor* input_c_backprop = nullptr; + Tensor* params_backprop = nullptr; + OP_REQUIRES_OK(context, + AllocateOutputs(context, model_shapes, params->shape(), + &input_backprop, &input_h_backprop, + &input_c_backprop, ¶ms_backprop)); - auto output_desc_s = executor->createRnnSequenceTensorDescriptor( - output_shape.dim_size(0), output_shape.dim_size(1), - output_shape.dim_size(2), data_type); - OP_REQUIRES_OK(context, FromExecutorStatus(output_desc_s)); - auto output_desc = output_desc_s.ConsumeValueOrDie(); - - auto input_data = AsDeviceMemory(input); - auto input_h_data = AsDeviceMemory(input_h); - DeviceMemory input_c_data; - if (HasInputC()) { - input_c_data = AsDeviceMemory(input_c); - } - auto params_data = AsDeviceMemory(params); - auto output_data = AsDeviceMemory(output); - auto output_h_data = AsDeviceMemory(output_h); - DeviceMemory output_c_data; - if (HasInputC()) { - output_c_data = AsDeviceMemory(output_c); - } - auto output_backprop_data = AsDeviceMemory(output_backprop); - auto output_h_backprop_data = AsDeviceMemory(output_h_backprop); - DeviceMemory output_c_backprop_data; - if (HasInputC()) { - output_c_backprop_data = AsDeviceMemory(output_c_backprop); - } - auto input_backprop_data = AsDeviceMemory(input_backprop); - auto input_h_backprop_data = AsDeviceMemory(input_h_backprop); - DeviceMemory input_c_backprop_data; - if (HasInputC()) { - input_c_backprop_data = AsDeviceMemory(input_c_backprop); - } - auto 
params_backprop_data = AsDeviceMemory(params_backprop); - auto reserve_space_uint8 = CastDeviceMemory(reserve_space); // Creates a memory callback for the workspace. The memory lives to the end // of this kernel calls. CudnnRnnAllocatorInTemp workspace_allocator(context); - bool launch_status = false; + const AlgorithmConfig default_algo_config; + Status launch_status; { mutex_lock l(mu_); - RnnScratchSpace& rnn_state = rnn_state_cache_[model_shapes]; - if (rnn_state.rnn_desc == nullptr || ResetRndGenState()) { - CudnnRNNPersistentSpaceAllocator* dropout_state_allocator = - new CudnnRNNPersistentSpaceAllocator(context); - rnn_state.dropout_state_allocator.reset(dropout_state_allocator); - const AlgorithmConfig algo_config; - auto rnn_desc_s = executor->createRnnDescriptor( - model_shapes.num_layers, model_shapes.num_units, - model_shapes.input_size, input_mode, rnn_direction_mode(), - rnn_mode(), data_type, algo_config, dropout(), seed(), - dropout_state_allocator); - OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s)); - rnn_state.rnn_desc = std::move(rnn_desc_s.ConsumeValueOrDie()); - } - launch_status = - stream - ->ThenRnnBackward( - *rnn_state.rnn_desc, *input_desc, input_data, - *hidden_state_desc, input_h_data, *hidden_state_desc, - input_c_data, params_data, *output_desc, output_data, - *hidden_state_desc, output_h_data, *hidden_state_desc, - output_c_data, output_backprop_data, output_h_backprop_data, - output_c_backprop_data, &input_backprop_data, - &input_h_backprop_data, &input_c_backprop_data, - ¶ms_backprop_data, &reserve_space_uint8, - &workspace_allocator, /*output_result_profile=*/nullptr) - .ok(); + RnnDescriptor* rnn_desc_ptr = nullptr; + OP_REQUIRES_OK( + context, GetCachedRnnDescriptor(context, model_shapes, input_mode, + default_algo_config, + &rnn_state_cache_, &rnn_desc_ptr)); + launch_status = DoBackward( + context, *rnn_desc_ptr, model_types(), model_shapes, input, input_h, + input_c, params, output, output_h, output_c, output_backprop, + output_h_backprop, output_c_backprop, reserve_space, input_backprop, + input_h_backprop, input_c_backprop, params_backprop, + &workspace_allocator, /*output_profile_result=*/nullptr); } - OP_REQUIRES(context, launch_status, - errors::Internal("Failed to call ThenRnnBackward")); + OP_REQUIRES_OK(context, launch_status); } private: mutex mu_; - std::unordered_map - rnn_state_cache_ GUARDED_BY(mu_); + RnnStateCache rnn_state_cache_ GUARDED_BY(mu_); + + Status ExtractBackwardInputs( + OpKernelContext* context, const CudnnRnnModelShapes& model_shapes, + const CudnnModelTypes& model_types, const Tensor** output, + const Tensor** output_h, const Tensor** output_c, + const Tensor** output_backprop, const Tensor** output_h_backprop, + const Tensor** output_c_backprop, const Tensor** reserve_space) { + TF_RETURN_IF_ERROR(context->input("output", output)); + TF_RETURN_IF_ERROR(context->input("output_backprop", output_backprop)); + TF_RETURN_IF_ERROR(context->input("output_h", output_h)); + TF_RETURN_IF_ERROR(context->input("output_h_backprop", output_h_backprop)); + if (model_types.HasInputC()) { + TF_RETURN_IF_ERROR(context->input("output_c", output_c)); + TF_RETURN_IF_ERROR( + context->input("output_c_backprop", output_c_backprop)); + } + TF_RETURN_IF_ERROR(context->input("reserve_space", reserve_space)); + const TensorShape& hidden_state_shape = model_shapes.hidden_state_shape; + const TensorShape& output_shape = model_shapes.output_shape; + + if (output_shape != (*output)->shape()) { + return errors::InvalidArgument( + "Invalid 
output shape: ", (*output)->shape().DebugString(), " ", + output_shape.DebugString()); + } + if (hidden_state_shape != (*output_h)->shape()) { + return errors::InvalidArgument( + "Invalid output_h shape: ", (*output_h)->shape().DebugString(), " ", + hidden_state_shape.DebugString()); + } + + if (output_shape != (*output_backprop)->shape()) { + return errors::InvalidArgument("Invalid output_backprop shape: ", + (*output_backprop)->shape().DebugString(), + " ", output_shape.DebugString()); + } + if (hidden_state_shape != (*output_h_backprop)->shape()) { + return errors::InvalidArgument( + "Invalid output_h_backprop shape: ", + (*output_h_backprop)->shape().DebugString(), " ", + hidden_state_shape.DebugString()); + } + + if (model_types.HasInputC()) { + if (hidden_state_shape != (*output_c)->shape()) { + return errors::InvalidArgument( + "Invalid output_c shape: ", (*output_c)->shape().DebugString(), " ", + hidden_state_shape.DebugString()); + } + if (hidden_state_shape != (*output_c_backprop)->shape()) { + return errors::InvalidArgument( + "Invalid output_c_backprop shape: ", + (*output_c_backprop)->shape().DebugString(), " ", + hidden_state_shape.DebugString()); + } + } + return Status::OK(); + } + + Status AllocateOutputs(OpKernelContext* context, + const CudnnRnnModelShapes& model_shapes, + const TensorShape& params_shape, + Tensor** input_backprop, Tensor** input_h_backprop, + Tensor** input_c_backprop, Tensor** params_backprop) { + const TensorShape& input_shape = model_shapes.input_shape; + const TensorShape& hidden_state_shape = model_shapes.hidden_state_shape; + + TF_RETURN_IF_ERROR( + context->allocate_output(0, input_shape, input_backprop)); + TF_RETURN_IF_ERROR( + context->allocate_output(1, hidden_state_shape, input_h_backprop)); + if (HasInputC()) { + TF_RETURN_IF_ERROR( + context->allocate_output(2, hidden_state_shape, input_c_backprop)); + } else { + // Only LSTM uses input_c and output_c. So for all other models, we only + // need to create dummy outputs. + TF_RETURN_IF_ERROR(context->allocate_output(2, {}, input_c_backprop)); + } + TF_RETURN_IF_ERROR( + context->allocate_output(3, params_shape, params_backprop)); + return Status::OK(); + } }; #define REGISTER_GPU(T) \ From a4b408543dd3b882131f522359bcb547c7972e4f Mon Sep 17 00:00:00 2001 From: Jeremy Lau Date: Fri, 13 Apr 2018 17:36:00 -0700 Subject: [PATCH 0106/1734] VLOG(1) all OutOfRange CtxFailures, and LOG(WARNING) all other CtxFailures. This unifies the logging behavior of the OP_REQUIRES and OP_REQUIRES_OK macros. PiperOrigin-RevId: 192848921 --- tensorflow/core/framework/op_kernel.cc | 48 +++++++++++++++----------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc index 05171006b0c..ca91d68f79f 100644 --- a/tensorflow/core/framework/op_kernel.cc +++ b/tensorflow/core/framework/op_kernel.cc @@ -1273,51 +1273,59 @@ const Eigen::SyclDevice& OpKernelContext::eigen_device() const { } #endif +namespace { +template +void CtxFailureInternal(OpKernelT* op_kernel, const char* file, int line, + const Status& s) { + const string logging_prefix = + file == nullptr ? "CtxFailure: " + : strings::StrCat("CtxFailure at ", io::Basename(file), + ":", line, ": "); + + if (errors::IsOutOfRange(s)) { + // VLOG OutOfRange errors. Dataset ops create OutOfRange errors when they + // reach end-of-sequence. 
+ VLOG(1) << logging_prefix << s;
+ } else {
+ LOG(WARNING) << logging_prefix << s;
+ }
+ op_kernel->SetStatus(s);
+}
+} // anonymous namespace
+
void OpKernelConstruction::CtxFailure(const Status& s) {
- VLOG(1) << s;
- SetStatus(s);
+ CtxFailureInternal(this, nullptr, 0, s);
}
void OpKernelConstruction::CtxFailureWithWarning(const Status& s) {
- LOG(WARNING) << s;
- SetStatus(s);
+ CtxFailureInternal(this, nullptr, 0, s);
}
void OpKernelConstruction::CtxFailure(const char* file, int line,
const Status& s) {
- VLOG(1) << "OP_REQUIRES failed at " << io::Basename(file) << ":" << line
- << " : " << s;
- SetStatus(s);
+ CtxFailureInternal(this, file, line, s);
}
void OpKernelConstruction::CtxFailureWithWarning(const char* file, int line,
const Status& s) {
- LOG(WARNING) << "OP_REQUIRES failed at " << io::Basename(file) << ":" << line
- << " : " << s;
- SetStatus(s);
+ CtxFailureInternal(this, file, line, s);
}
void OpKernelContext::CtxFailure(const Status& s) {
- VLOG(1) << s;
- SetStatus(s);
+ CtxFailureInternal(this, nullptr, 0, s);
}
void OpKernelContext::CtxFailureWithWarning(const Status& s) {
- LOG(WARNING) << s;
- SetStatus(s);
+ CtxFailureInternal(this, nullptr, 0, s);
}
void OpKernelContext::CtxFailure(const char* file, int line, const Status& s) {
- VLOG(1) << "OP_REQUIRES failed at " << io::Basename(file) << ":" << line
- << " : " << s;
- SetStatus(s);
+ CtxFailureInternal(this, file, line, s);
}
void OpKernelContext::CtxFailureWithWarning(const char* file, int line,
const Status& s) {
- LOG(WARNING) << "OP_REQUIRES failed at " << io::Basename(file) << ":" << line
- << " : " << s;
- SetStatus(s);
+ CtxFailureInternal(this, file, line, s);
}
} // namespace tensorflow
From 6e533eb718b33f23ab3f06025cbf680258534d76 Mon Sep 17 00:00:00 2001
From: Igor Saprykin
Date: Fri, 13 Apr 2018 17:47:58 -0700
Subject: [PATCH 0107/1734] Add a caveat about make_initializable_iterator to
the README.
PiperOrigin-RevId: 192850014
---
tensorflow/contrib/distribute/README.md | 2 ++
1 file changed, 2 insertions(+)
diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md
index 14de1e8f491..24827311987 100644
--- a/tensorflow/contrib/distribute/README.md
+++ b/tensorflow/contrib/distribute/README.md
@@ -130,6 +130,8 @@ adjusting your learning rate or batch size according to the number of GPUs.
We are working on addressing this limitation by splitting each batch across
GPUs instead.
* PartitionedVariables are not supported yet.
+* Input pipelines with Datasets that capture stateful objects and rely on
+`make_initializable_iterator` are not supported yet.
## What's next?
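To make the two patches above concrete, here is a minimal Python sketch, assuming the TF 1.x graph-mode API these patches target; the pipeline itself is hypothetical and only for illustration. A dataset whose map function wraps `tf.py_func` captures a stateful Python object and relies on `make_initializable_iterator`, the combination the README caveat says DistributionStrategy does not support yet, and running the iterator past the end produces the `OutOfRange` status that the unified `CtxFailure` path now logs at `VLOG(1)` instead of `WARNING`:

```python
import numpy as np
import tensorflow as tf

# Wrapping py_func captures a stateful Python callable, so this pipeline
# relies on an initializable iterator, the case the README caveat refers to.
dataset = tf.data.Dataset.range(3).map(
    lambda x: tf.py_func(lambda v: np.float32(v * 2), [x], tf.float32))
iterator = dataset.make_initializable_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
  sess.run(iterator.initializer)
  try:
    while True:
      sess.run(next_element)
  except tf.errors.OutOfRangeError:
    # Routine end-of-sequence: dataset ops fail with OutOfRange, which the
    # unified CtxFailure path above now logs at VLOG(1) rather than WARNING.
    pass
```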
From ef24ad14502e992716c49fdd5c63e6b2c2fb6b5a Mon Sep 17 00:00:00 2001 From: Asim Shankar Date: Fri, 13 Apr 2018 17:51:37 -0700 Subject: [PATCH 0108/1734] Java: Bump release to 1.8.0-rc0 PiperOrigin-RevId: 192850310 --- tensorflow/java/maven/libtensorflow/pom.xml | 2 +- tensorflow/java/maven/libtensorflow_jni/pom.xml | 2 +- tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml | 2 +- tensorflow/java/maven/pom.xml | 2 +- tensorflow/java/maven/proto/pom.xml | 2 +- tensorflow/java/maven/tensorflow/pom.xml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml index c99d04869a7..9c1601753bd 100644 --- a/tensorflow/java/maven/libtensorflow/pom.xml +++ b/tensorflow/java/maven/libtensorflow/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.7.0 + 1.8.0-rc0 ../ libtensorflow diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml index 4561c2c8ade..3d013e12b0d 100644 --- a/tensorflow/java/maven/libtensorflow_jni/pom.xml +++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.7.0 + 1.8.0-rc0 ../ libtensorflow_jni diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml index 82a2b8e7694..40e44af1f53 100644 --- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml +++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.7.0 + 1.8.0-rc0 ../ libtensorflow_jni_gpu diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml index 4c1ec0cc803..82bfd0c73ae 100644 --- a/tensorflow/java/maven/pom.xml +++ b/tensorflow/java/maven/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.tensorflow parentpom - 1.7.0 + 1.8.0-rc0 pom https://www.tensorflow.org diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml index fcd8236bad3..0a2775a500c 100644 --- a/tensorflow/java/maven/proto/pom.xml +++ b/tensorflow/java/maven/proto/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.7.0 + 1.8.0-rc0 ../ proto diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml index 241581713ad..61961432a7e 100644 --- a/tensorflow/java/maven/tensorflow/pom.xml +++ b/tensorflow/java/maven/tensorflow/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.7.0 + 1.8.0-rc0 ../ tensorflow From 3652556dab3ebfe0152232facc7304fe5754aecb Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Fri, 13 Apr 2018 17:52:20 -0700 Subject: [PATCH 0109/1734] Merge changes from github. 
PiperOrigin-RevId: 192850372 --- tensorflow/BUILD | 7 +- tensorflow/compiler/jit/BUILD | 1 + .../compiler/jit/mark_for_compilation_pass.cc | 4 + tensorflow/contrib/cmake/external/grpc.cmake | 1 + .../copy_graph/python/util/copy_elements.py | 4 +- tensorflow/contrib/data/__init__.py | 2 + .../contrib/data/python/kernel_tests/BUILD | 1 + .../kernel_tests/batch_dataset_op_test.py | 70 ++++ .../kernel_tests/sequence_dataset_op_test.py | 10 + tensorflow/contrib/data/python/ops/BUILD | 1 + .../contrib/data/python/ops/batching.py | 41 ++ .../contrib/distribute/python/values.py | 2 +- .../contrib/kernel_methods/python/losses.py | 6 +- .../python/mappers/random_fourier_features.py | 42 +- .../mappers/random_fourier_features_test.py | 2 +- .../contrib/kfac/python/ops/fisher_blocks.py | 82 ++-- .../contrib/lite/build_ios_universal_lib.sh | 15 +- .../contrib/metrics/python/ops/metric_ops.py | 29 +- tensorflow/contrib/rnn/python/ops/rnn_cell.py | 2 +- .../seq2seq/python/ops/attention_wrapper.py | 4 +- tensorflow/contrib/sparsemax/__init__.py | 2 +- .../contrib/sparsemax/python/ops/sparsemax.py | 2 +- .../contrib/tensorrt/convert/convert_graph.cc | 10 +- .../contrib/tensorrt/convert/convert_nodes.cc | 68 ++- .../base_api/api_def_ClipByValue.pbtxt | 36 ++ .../python_api/api_def_ClipByValue.pbtxt | 4 + .../core/common_runtime/process_util.cc | 21 +- tensorflow/core/grappler/optimizers/BUILD | 23 +- tensorflow/core/kernels/BUILD | 2 + tensorflow/core/kernels/cwise_op_abs.cc | 2 - tensorflow/core/kernels/cwise_op_clip.cc | 225 ++++++++++ tensorflow/core/kernels/cwise_op_clip.h | 61 +++ .../core/kernels/cwise_op_clip_gpu.cu.cc | 134 ++++++ tensorflow/core/kernels/maxpooling_op.cc | 93 ++++- .../core/kernels/segment_reduction_ops.h | 6 + tensorflow/core/ops/dataset_ops.cc | 12 +- tensorflow/core/ops/math_ops.cc | 8 + tensorflow/core/platform/macros.h | 9 +- .../docs_src/community/documentation.md | 18 +- tensorflow/docs_src/extend/adding_an_op.md | 159 +++---- .../docs_src/get_started/custom_estimators.md | 2 +- tensorflow/docs_src/install/install_c.md | 2 +- .../docs_src/performance/performance_guide.md | 8 +- .../docs_src/programmers_guide/debugger.md | 57 ++- tensorflow/python/BUILD | 1 + tensorflow/python/framework/dtypes.py | 10 + tensorflow/python/framework/dtypes_test.py | 5 + tensorflow/python/framework/function_test.py | 3 +- tensorflow/python/framework/tensor_shape.py | 3 + .../python/framework/tensor_shape_test.py | 5 + .../keras/_impl/keras/utils/io_utils.py | 14 +- .../python/kernel_tests/clip_ops_test.py | 124 +++++- .../python/kernel_tests/pooling_ops_test.py | 6 - tensorflow/python/ops/clip_ops.py | 30 ++ tensorflow/python/ops/hidden_ops.txt | 395 ++++++++++++++++++ tensorflow/python/util/tf_inspect.py | 43 +- tensorflow/tensorflow.bzl | 53 ++- .../tools/api/generator/create_python_api.py | 3 +- tensorflow/tools/docker/Dockerfile | 2 +- tensorflow/tools/docker/Dockerfile.devel | 2 + tensorflow/tools/docker/Dockerfile.devel-gpu | 2 + tensorflow/tools/docker/Dockerfile.gpu | 2 +- .../notebooks/3_mnist_from_scratch.ipynb | 6 +- .../docker/parameterized_docker_build.sh | 4 +- tensorflow/tools/docs/BUILD | 2 +- tensorflow/tools/docs/build_docs_test.py | 5 - tensorflow/tools/docs/generate_lib.py | 19 +- tensorflow/tools/docs/generate_lib_test.py | 3 - tensorflow/tools/docs/parser.py | 56 ++- tensorflow/tools/docs/parser_test.py | 80 +++- tensorflow/tools/docs/pretty_docs.py | 12 +- tensorflow/tools/docs/py_guide_parser.py | 2 +- tensorflow/workspace.bzl | 13 +- 73 files changed, 1795 insertions(+), 
400 deletions(-) create mode 100644 tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_ClipByValue.pbtxt create mode 100644 tensorflow/core/kernels/cwise_op_clip.cc create mode 100644 tensorflow/core/kernels/cwise_op_clip.h create mode 100644 tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc create mode 100644 tensorflow/python/ops/hidden_ops.txt diff --git a/tensorflow/BUILD b/tensorflow/BUILD index cfafffdd130..f2ad16fa04f 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -450,11 +450,12 @@ tf_cc_shared_object( linkstatic = 1, visibility = ["//visibility:public"], deps = [ - "//tensorflow/core:framework_internal_impl", - "//tensorflow/core:lib_internal_impl", "//tensorflow/core:core_cpu_impl", - "//tensorflow/stream_executor:stream_executor_impl", + "//tensorflow/core:framework_internal_impl", "//tensorflow/core:gpu_runtime_impl", + "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry_impl", + "//tensorflow/core:lib_internal_impl", + "//tensorflow/stream_executor:stream_executor_impl", ] + tf_additional_binary_deps(), ) diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 6edeb7047f9..50fa95c4f32 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -318,6 +318,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/kernels:bounds_check", ], ) diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 0c9fbf3d545..8e2ee0f1d71 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -35,6 +35,7 @@ limitations under the License. 
#include "tensorflow/core/framework/types.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/control_flow.h" +#include "tensorflow/core/kernels/bounds_check.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/public/version.h" @@ -441,6 +442,9 @@ string DescribeCycle(const GraphCycles& cycles, const Graph& graph, int src, } auto node_name = [&cycles, &graph](int node_id) { + if (!FastBoundsCheck(node_id, graph.num_node_ids())) { + return string("(null)"); + } auto* node = graph.FindNodeId(node_id); if (node == nullptr) { return string("(null)"); diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake index bec8177a3fb..35c2a294ecf 100644 --- a/tensorflow/contrib/cmake/external/grpc.cmake +++ b/tensorflow/contrib/cmake/external/grpc.cmake @@ -35,6 +35,7 @@ else() set(grpc_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc++_unsecure.a ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc_unsecure.a + ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libaddress_sorting.a ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/third_party/cares/cares/lib/libcares.a ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgpr.a) endif() diff --git a/tensorflow/contrib/copy_graph/python/util/copy_elements.py b/tensorflow/contrib/copy_graph/python/util/copy_elements.py index b806799202b..102bc460fda 100644 --- a/tensorflow/contrib/copy_graph/python/util/copy_elements.py +++ b/tensorflow/contrib/copy_graph/python/util/copy_elements.py @@ -201,7 +201,7 @@ def copy_op_to_graph(org_instance, to_graph, variables, scope=''): #An instance of tensorflow.core.framework.node_def_pb2.NodeDef, it #stores String-based info such as name, device and type of the op. #Unique to every Operation instance. - new_node_def = deepcopy(op._node_def) + new_node_def = deepcopy(op.node_def) #Change the name new_node_def.name = new_name @@ -211,7 +211,7 @@ def copy_op_to_graph(org_instance, to_graph, variables, scope=''): #Make a copy of the op_def too. #Its unique to every _type_ of Operation. - op_def = deepcopy(op._op_def) + op_def = deepcopy(op.op_def) #Initialize a new Operation instance new_op = ops.Operation(new_node_def, to_graph, new_inputs, output_types, diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py index f58e5ec1f03..637b1dc46cb 100644 --- a/tensorflow/contrib/data/__init__.py +++ b/tensorflow/contrib/data/__init__.py @@ -25,6 +25,7 @@ See the @{$datasets$Importing Data} Programmer's Guide for an overview. 
@@Counter @@SqlDataset +@@assert_element_shape @@batch_and_drop_remainder @@bucket_by_sequence_length @@dense_to_sparse_batch @@ -55,6 +56,7 @@ from __future__ import print_function # pylint: disable=unused-import +from tensorflow.contrib.data.python.ops.batching import assert_element_shape from tensorflow.contrib.data.python.ops.batching import batch_and_drop_remainder from tensorflow.contrib.data.python.ops.batching import dense_to_sparse_batch from tensorflow.contrib.data.python.ops.batching import map_and_batch diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD index a8481dc90af..b475c9fa6b1 100644 --- a/tensorflow/contrib/data/python/kernel_tests/BUILD +++ b/tensorflow/contrib/data/python/kernel_tests/BUILD @@ -21,6 +21,7 @@ py_test( "//tensorflow/python:dtypes", "//tensorflow/python:errors", "//tensorflow/python:math_ops", + "//tensorflow/python:script_ops", "//tensorflow/python:sparse_tensor", "//tensorflow/python:string_ops", "//tensorflow/python:tensor_shape", diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py index 75482f67da1..413d8737978 100644 --- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py @@ -28,8 +28,10 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import script_ops from tensorflow.python.ops import string_ops from tensorflow.python.platform import test @@ -579,5 +581,73 @@ class PaddedBatchDatasetSerializationTest( lambda: build_dataset(seq_lens2), 8) +class RestructuredDatasetTest(test.TestCase): + + def test_assert_element_shape(self): + + def create_unknown_shape_dataset(x): + return script_ops.py_func(lambda _: (np.ones(2, dtype=np.float32), + np.zeros((3, 4), dtype=np.int32)), + [x], + [dtypes.float32, dtypes.int32]) + + dataset = dataset_ops.Dataset.range(5).map(create_unknown_shape_dataset) + unknown_shapes = (tensor_shape.TensorShape(None), + tensor_shape.TensorShape(None)) + self.assertEqual(unknown_shapes, dataset.output_shapes) + + expected_shapes = (tensor_shape.TensorShape(2), + tensor_shape.TensorShape((3, 4))) + result = dataset.apply(batching.assert_element_shape(expected_shapes)) + self.assertEqual(expected_shapes, result.output_shapes) + + iterator = result.make_initializable_iterator() + init_op = iterator.initializer + get_next = iterator.get_next() + with self.test_session() as sess: + sess.run(init_op) + for _ in range(5): + sess.run(get_next) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + + def test_assert_wrong_element_shape(self): + + def create_dataset(_): + return (array_ops.ones(2, dtype=dtypes.float32), + array_ops.zeros((3, 4), dtype=dtypes.int32)) + + dataset = dataset_ops.Dataset.range(3).map(create_dataset) + wrong_shapes = (tensor_shape.TensorShape(2), + tensor_shape.TensorShape((3, 10))) + with self.assertRaises(ValueError): + dataset.apply(batching.assert_element_shape(wrong_shapes)) + + def test_assert_wrong_element_shape_on_unknown_shape_dataset(self): + + def create_unknown_shape_dataset(x): + return script_ops.py_func(lambda 
_: (np.ones(2, dtype=np.float32), + np.zeros((3, 4), dtype=np.int32)), + [x], + [dtypes.float32, dtypes.int32]) + + dataset = dataset_ops.Dataset.range(3).map(create_unknown_shape_dataset) + unknown_shapes = (tensor_shape.TensorShape(None), + tensor_shape.TensorShape(None)) + self.assertEqual(unknown_shapes, dataset.output_shapes) + + wrong_shapes = (tensor_shape.TensorShape(2), + tensor_shape.TensorShape((3, 10))) + iterator = ( + dataset.apply(batching.assert_element_shape(wrong_shapes)) + .make_initializable_iterator()) + init_op = iterator.initializer + get_next = iterator.get_next() + with self.test_session() as sess: + sess.run(init_op) + with self.assertRaises(errors.InvalidArgumentError): + sess.run(get_next) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py index b044ff17757..d0cb203a3af 100644 --- a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py @@ -47,6 +47,11 @@ class SequenceDatasetSerializationTest( # Skip nothing self.run_core_tests(lambda: self._build_skip_dataset(0), None, 10) + def testInvalidSkip(self): + with self.assertRaisesRegexp(ValueError, + 'Shape must be rank 0 but is rank 1'): + self.run_core_tests(lambda: self._build_skip_dataset([1, 2]), None, 0) + def _build_take_dataset(self, count): components = (np.arange(10),) return dataset_ops.Dataset.from_tensor_slices(components).take(count) @@ -69,6 +74,11 @@ class SequenceDatasetSerializationTest( # Take nothing self.run_core_tests(lambda: self._build_take_dataset(0), None, 0) + def testInvalidTake(self): + with self.assertRaisesRegexp(ValueError, + 'Shape must be rank 0 but is rank 1'): + self.run_core_tests(lambda: self._build_take_dataset([1, 2]), None, 0) + def _build_repeat_dataset(self, count, take_count=3): components = (np.arange(10),) return dataset_ops.Dataset.from_tensor_slices(components).take( diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD index 7c28d1f0059..0e4590829b1 100644 --- a/tensorflow/contrib/data/python/ops/BUILD +++ b/tensorflow/contrib/data/python/ops/BUILD @@ -112,6 +112,7 @@ py_library( srcs = ["batching.py"], srcs_version = "PY2AND3", deps = [ + "//tensorflow/contrib/framework:framework_py", "//tensorflow/python:array_ops", "//tensorflow/python:dataset_ops_gen", "//tensorflow/python:dtypes", diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py index a212adf6cf5..28db949da9e 100644 --- a/tensorflow/contrib/data/python/ops/batching.py +++ b/tensorflow/contrib/data/python/ops/batching.py @@ -17,6 +17,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.contrib.framework import with_shape from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.util import nest from tensorflow.python.data.util import sparse @@ -345,6 +346,46 @@ class _RestructuredDataset(dataset_ops.Dataset): return self._output_shapes +def assert_element_shape(expected_shapes): + """Assert the shape of this `Dataset`. 
+
+ ```python
+ shapes = [tf.TensorShape([16, 256]), tf.TensorShape(None)]
+ result = dataset.apply(tf.contrib.data.assert_element_shape(shapes))
+ print(result.output_shapes) # ==> "((16, 256), )"
+ ```
+
+ If dataset shapes and `expected_shapes` are fully defined, assert they match.
+ Otherwise, add an assert op that will validate the shapes when tensors are
+ evaluated, and set shapes on tensors, respectively.
+
+ Args:
+ expected_shapes: A nested structure of `tf.TensorShape` objects.
+
+ Returns:
+ A `Dataset` transformation function, which can be passed to
+ @{tf.data.Dataset.apply}
+ """
+
+ def _check_shape(*elements):
+ flatten_tensors = nest.flatten(elements)
+ flatten_shapes = nest.flatten(expected_shapes)
+ checked_tensors = [
+ with_shape(shape, tensor)
+ for shape, tensor in zip(flatten_shapes, flatten_tensors)
+ ]
+ return nest.pack_sequence_as(elements, checked_tensors)
+
+ def _apply_fn(dataset):
+ return _RestructuredDataset(
+ dataset.map(_check_shape),
+ dataset.output_types,
+ output_shapes=expected_shapes,
+ output_classes=dataset.output_classes)
+
+ return _apply_fn
+
+
class _MapAndBatchDataset(dataset_ops.MapDataset):
"""A `Dataset` that maps a function over a batch of elements."""
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
index 9acb6a9db93..87bf0590384 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -73,7 +73,7 @@ class DistributedValues(object):
@property
def devices(self):
- return self._index.keys()
+ return list(self._index.keys())
def __str__(self):
return "%s:%s" % (self.__class__.__name__, self._index)
diff --git a/tensorflow/contrib/kernel_methods/python/losses.py b/tensorflow/contrib/kernel_methods/python/losses.py
index f182fef067b..4ef0a66a524 100644
--- a/tensorflow/contrib/kernel_methods/python/losses.py
+++ b/tensorflow/contrib/kernel_methods/python/losses.py
@@ -43,10 +43,10 @@ def sparse_multiclass_hinge_loss(
This is a generalization of standard (binary) hinge loss. For a given
instance with correct label c*, the loss is given by:
- loss = max_{c != c*} logits_c - logits_{c*} + 1.
+ $$loss = max_{c != c*} logits_c - logits_{c*} + 1.$$
or equivalently
- loss = max_c { logits_c - logits_{c*} + I_{c != c*} }
- where I_{c != c*} = 1 if c != c* and 0 otherwise.
+ $$loss = max_c { logits_c - logits_{c*} + I_{c != c*} }$$
+ where \\(I_{c != c*} = 1\ \text{if}\ c != c*\\) and 0 otherwise.
Args:
labels: `Tensor` of shape [batch_size] or [batch_size, 1]. Corresponds to
diff --git a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
index 9dc01124ab1..9a721a9d440 100644
--- a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
+++ b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
@@ -34,33 +34,31 @@ class RandomFourierFeatureMapper(dkm.DenseKernelMapper):
r"""Class that implements Random Fourier Feature Mapping (RFFM) in TensorFlow.
The RFFM mapping is used to approximate the Gaussian (RBF) kernel:
- ```
- exp(-||x-y||_2^2 / (2 * sigma^2))
- ```
+ $$exp(-||x-y||_2^2 / (2 * \sigma^2))$$
The implementation of RFFM is based on the following paper:
"Random Features for Large-Scale Kernel Machines" by Ali Rahimi and Ben Recht.
(link: https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf) - The mapping uses a matrix `Omega \in R^{d x D}` and a bias vector `b \in R^D` - where `d` is the input dimension (number of dense input features) and `D` is - the output dimension (i.e., dimension of the feature space the input is mapped - to). Each entry of `Omega` is sampled i.i.d. from a (scaled) Gaussian - distribution and each entry of `b` is sampled independently and uniformly from - [0, 2 * pi]. + The mapping uses a matrix \\(\Omega \in R^{d x D}\\) and a bias vector + \\(b \in R^D\\) where \\(d\\) is the input dimension (number of dense input + features) and \\(D\\) is the output dimension (i.e., dimension of the feature + space the input is mapped to). Each entry of \\(\Omega\\) is sampled i.i.d. + from a (scaled) Gaussian distribution and each entry of \\(b\\) is sampled + independently and uniformly from [0, \\(2 * \pi\\)]. - For a single input feature vector x in R^d, its RFFM is defined as: - ``` - sqrt(2/D) * cos(x * Omega + b) - ``` - where `cos` is the element-wise cosine function and `x, b` are represented as - row vectors. The aforementioned paper shows that the linear kernel of - RFFM-mapped vectors approximates the Gaussian kernel of the initial vectors. + For a single input feature vector \\(x \in R^d\\), its RFFM is defined as: + $$\sqrt(2/D) * cos(x * \Omega + b)$$ + + where \\(cos\\) is the element-wise cosine function and \\(x, b\\) are + represented as row vectors. The aforementioned paper shows that the linear + kernel of RFFM-mapped vectors approximates the Gaussian kernel of the initial + vectors. """ def __init__(self, input_dim, output_dim, stddev=1.0, seed=1, name=None): - """Constructs a RandomFourierFeatureMapper instance. + r"""Constructs a RandomFourierFeatureMapper instance. Args: input_dim: The dimension (number of features) of the tensors to be mapped. @@ -68,11 +66,11 @@ class RandomFourierFeatureMapper(dkm.DenseKernelMapper): stddev: The standard deviation of the Gaussian kernel to be approximated. The error of the classifier trained using this approximation is very sensitive to this parameter. - seed: An integer used to initialize the parameters (`Omega` and `b`) of - the mapper. For repeatable sequences across different invocations of the - mapper object (for instance, to ensure consistent mapping both at - training and eval/inference if these happen in different invocations), - set this to the same integer. + seed: An integer used to initialize the parameters (\\(\Omega\\) and + \\(b\\)) of the mapper. For repeatable sequences across different + invocations of the mapper object (for instance, to ensure consistent + mapping both at training and eval/inference if these happen in + different invocations), set this to the same integer. name: name for the mapper object. """ # TODO(sibyl-vie3Poto): Maybe infer input_dim and/or output_dim (if not explicitly diff --git a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py index 6f4a2644859..91929184a2e 100644 --- a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py +++ b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py @@ -34,7 +34,7 @@ def _inner_product(x, y): """Inner product between tensors x and y. The input tensors are assumed to be in ROW representation, that is, the method - returns x * y^T. + returns \\(x * y^T\\). 
Args: x: input tensor in row format diff --git a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py index e0d9cb5ea9d..00b3673a742 100644 --- a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py +++ b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py @@ -19,11 +19,11 @@ Information matrix. Suppose one has a model that parameterizes a posterior distribution over 'y' given 'x' with parameters 'params', p(y | x, params). Its Fisher Information matrix is given by, - F(params) = E[ v(x, y, params) v(x, y, params)^T ] + $$F(params) = E[ v(x, y, params) v(x, y, params)^T ]$$ where, - v(x, y, params) = (d / d params) log p(y | x, params) + $$v(x, y, params) = (d / d params) log p(y | x, params)$$ and the expectation is taken with respect to the data's distribution for 'x' and the model's posterior distribution for 'y', @@ -85,7 +85,7 @@ def normalize_damping(damping, num_replications): def compute_pi_tracenorm(left_cov, right_cov): """Computes the scalar constant pi for Tikhonov regularization/damping. - pi = sqrt( (trace(A) / dim(A)) / (trace(B) / dim(B)) ) + $$\pi = \sqrt{ (trace(A) / dim(A)) / (trace(B) / dim(B)) }$$ See section 6.3 of https://arxiv.org/pdf/1503.05671.pdf for details. Args: @@ -462,14 +462,14 @@ class FullyConnectedDiagonalFB(InputOutputMultiTower, FisherBlock): Let 'params' be a vector parameterizing a model and 'i' an arbitrary index into it. We are interested in Fisher(params)[i, i]. This is, - Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i] - = E[ v(x, y, params)[i] ^ 2 ] + $$Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i] + = E[ v(x, y, params)[i] ^ 2 ]$$ Consider fully connected layer in this model with (unshared) weight matrix 'w'. For an example 'x' that produces layer inputs 'a' and output preactivations 's', - v(x, y, w) = vec( a (d loss / d s)^T ) + $$v(x, y, w) = vec( a (d loss / d s)^T )$$ This FisherBlock tracks Fisher(params)[i, i] for all indices 'i' corresponding to the layer's parameters 'w'. @@ -532,14 +532,14 @@ class ConvDiagonalFB(InputOutputMultiTower, FisherBlock): Let 'params' be a vector parameterizing a model and 'i' an arbitrary index into it. We are interested in Fisher(params)[i, i]. This is, - Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i] - = E[ v(x, y, params)[i] ^ 2 ] + $$Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i] + = E[ v(x, y, params)[i] ^ 2 ]$$ Consider a convoluational layer in this model with (unshared) filter matrix 'w'. For an example image 'x' that produces layer inputs 'a' and output preactivations 's', - v(x, y, w) = vec( sum_{loc} a_{loc} (d loss / d s_{loc})^T ) + $$v(x, y, w) = vec( sum_{loc} a_{loc} (d loss / d s_{loc})^T )$$ where 'loc' is a single (x, y) location in an image. @@ -805,12 +805,12 @@ class ConvKFCBasicFB(InputOutputMultiTower, KroneckerProductFB): 'w'. For a minibatch that produces inputs 'a' and output preactivations 's', this FisherBlock estimates, - F(w) = #locations * kronecker(E[flat(a) flat(a)^T], - E[flat(ds) flat(ds)^T]) + $$F(w) = \#locations * kronecker(E[flat(a) flat(a)^T], + E[flat(ds) flat(ds)^T])$$ where - ds = (d / ds) log p(y | x, w) + $$ds = (d / ds) log p(y | x, w)$$ #locations = number of (x, y) locations where 'w' is applied. 
where the expectation is taken over all examples and locations and flat() @@ -1567,7 +1567,7 @@ class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse, if self._option == SeriesFBApproximation.option1: - # Note that L_A = A0^(-1/2) * U_A and L_G = G0^(-1/2) * U_G. + # Note that \\(L_A = A0^{-1/2} * U_A and L_G = G0^{-1/2} * U_G.\\) L_A, psi_A = self._input_factor.get_option1quants( self._input_damping_func) L_G, psi_G = self._output_factor.get_option1quants( @@ -1581,33 +1581,33 @@ class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse, T = self._num_timesteps return (1 - x)**2 / (T * (1 - x**2) - 2 * x * (1 - x**T)) - # Y = gamma( psi_G*psi_A^T ) (computed element-wise) + # \\(Y = \gamma( psi_G*psi_A^T )\\) (computed element-wise) # Even though Y is Z-independent we are recomputing it from the psi's # each since Y depends on both A and G quantities, and it is relatively # cheap to compute. Y = gamma(array_ops.reshape(psi_G, [int(psi_G.shape[0]), -1]) * psi_A) - # Z = L_G^T * Z * L_A + # \\(Z = L_G^T * Z * L_A\\) # This is equivalent to the following computation from the original # pseudo-code: - # Z = G0^(-1/2) * Z * A0^(-1/2) - # Z = U_G^T * Z * U_A + # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\) + # \\(Z = U_G^T * Z * U_A\\) Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A), transpose_a=True) - # Z = Z .* Y + # \\(Z = Z .* Y\\) Z *= Y - # Z = L_G * Z * L_A^T + # \\(Z = L_G * Z * L_A^T\\) # This is equivalent to the following computation from the original # pseudo-code: - # Z = U_G * Z * U_A^T - # Z = G0^(-1/2) * Z * A0^(-1/2) + # \\(Z = U_G * Z * U_A^T\\) + # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\) Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A, transpose_b=True)) elif self._option == SeriesFBApproximation.option2: - # Note that P_A = A_1^T * A_0^(-1) and P_G = G_1^T * G_0^(-1), - # and K_A = A_0^(-1/2) * E_A and K_G = G_0^(-1/2) * E_G. + # Note that \\(P_A = A_1^T * A_0^{-1} and P_G = G_1^T * G_0^{-1}\\), + # and \\(K_A = A_0^{-1/2} * E_A\ and\ K_G = G_0^{-1/2} * E_G.\\) P_A, K_A, mu_A = self._input_factor.get_option2quants( self._input_damping_func) P_G, K_G, mu_G = self._output_factor.get_option2quants( @@ -1616,26 +1616,26 @@ class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse, # Our approach differs superficially from the pseudo-code in the paper # in order to reduce the total number of matrix-matrix multiplies. 
# In particular, the first three computations in the pseudo code are
- # Z = G0^(-1/2) * Z * A0^(-1/2)
- # Z = Z - hPsi_G^T * Z * hPsi_A
- # Z = E_G^T * Z * E_A
- # Noting that hPsi = C0^(-1/2) * C1 * C0^(-1/2), so that
- # C0^(-1/2) * hPsi = C0^(-1) * C1 * C0^(-1/2) = P^T * C0^(-1/2)
+ # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\)
+ # \\(Z = Z - hPsi_G^T * Z * hPsi_A\\)
+ # \\(Z = E_G^T * Z * E_A\\)
+ # Noting that \\(hPsi = C0^{-1/2} * C1 * C0^{-1/2}\\), so that
+ # \\(C0^{-1/2} * hPsi = C0^{-1} * C1 * C0^{-1/2} = P^T * C0^{-1/2}\\)
# the entire computation can be written as
- # Z = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
- # - hPsi_G^T * G0^(-1/2) * Z * A0^(-1/2) * hPsi_A) * E_A
- # = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
- # - G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2)) * E_A
- # = E_G^T * G0^(-1/2) * Z * A0^(-1/2) * E_A
- # - E_G^T* G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2) * E_A
- # = K_G^T * Z * K_A - K_G^T * P_G * Z * P_A^T * K_A
+ # \\(Z = E_G^T * (G0^{-1/2} * Z * A0^{-1/2}\\)
+ # \\( - hPsi_G^T * G0^{-1/2} * Z * A0^{-1/2} * hPsi_A) * E_A\\)
+ # \\( = E_G^T * (G0^{-1/2} * Z * A0^{-1/2}\\)
+ # \\( - G0^{-1/2} * P_G * Z * P_A^T * A0^{-1/2}) * E_A\\)
+ # \\( = E_G^T * G0^{-1/2} * Z * A0^{-1/2} * E_A\\)
+ # \\( - E_G^T* G0^{-1/2} * P_G * Z * P_A^T * A0^{-1/2} * E_A\\)
+ # \\( = K_G^T * Z * K_A - K_G^T * P_G * Z * P_A^T * K_A\\)
# This final expression is computed by the following two lines:
- # Z = Z - P_G * Z * P_A^T
+ # \\(Z = Z - P_G * Z * P_A^T\\)
Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A, transpose_b=True))
- # Z = K_G^T * Z * K_A
+ # \\(Z = K_G^T * Z * K_A\\)
Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A), transpose_a=True)
- # Z = Z ./ (1*1^T - mu_G*mu_A^T)
+ # \\(Z = Z ./ (1*1^T - mu_G*mu_A^T)\\)
# Be careful with the outer product. We don't want to accidentally
# make it an inner-product instead.
tmp = 1.0 - array_ops.reshape(mu_G, [int(mu_G.shape[0]), -1]) * mu_A
@@ -1646,13 +1646,13 @@ class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse,
# We now perform the transpose/reverse version of the operations
# derived above, whose derivation from the original pseudo-code is
# analogous.
- # Z = K_G * Z * K_A^T
+ # \\(Z = K_G * Z * K_A^T\\)
Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A, transpose_b=True))
- # Z = Z - P_G^T * Z * P_A
+ # \\(Z = Z - P_G^T * Z * P_A\\)
Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A), transpose_a=True)
- # Z = normalize (1/E[T]) * Z
+ # \\(Z = normalize (1/E[T]) * Z\\)
# Note that this normalization is done because we compute the statistics
# by averaging, not summing, over time. (And the gradient is presumably
# summed over time, not averaged, and thus their scales are different.)
diff --git a/tensorflow/contrib/lite/build_ios_universal_lib.sh b/tensorflow/contrib/lite/build_ios_universal_lib.sh
index 4a9023ff33d..9f398f4a9f3 100755
--- a/tensorflow/contrib/lite/build_ios_universal_lib.sh
+++ b/tensorflow/contrib/lite/build_ios_universal_lib.sh
@@ -19,11 +19,16 @@ set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR/../../.."
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=x86_64 -j 8 -make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=i386 -j 8 -make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7 -j 8 -make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7s -j 8 -make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=arm64 -j 8 +make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=x86_64 -j 8 \ +$SCRIPT_DIR/gen/lib/ios_x86_64/libtensorflow-lite.a +make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=i386 -j 8 \ +$SCRIPT_DIR/gen/lib/ios_i386/libtensorflow-lite.a +make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7 -j 8 \ +$SCRIPT_DIR/gen/lib/ios_armv7/libtensorflow-lite.a +make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7s -j 8 \ +$SCRIPT_DIR/gen/lib/ios_armv7s/libtensorflow-lite.a +make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=arm64 -j 8 \ +$SCRIPT_DIR/gen/lib/ios_arm64/libtensorflow-lite.a lipo \ tensorflow/contrib/lite/gen/lib/ios_x86_64/libtensorflow-lite.a \ diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py index 81f05e7ce58..9c8ae48094e 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py @@ -63,6 +63,8 @@ def _safe_div(numerator, denominator, name): name=name) +@deprecated(None, 'Please switch to tf.metrics.true_positives. Note that the ' + 'order of the labels and predictions arguments has been switched.') def streaming_true_positives(predictions, labels, weights=None, @@ -107,6 +109,8 @@ def streaming_true_positives(predictions, name=name) +@deprecated(None, 'Please switch to tf.metrics.true_negatives. Note that the ' + 'order of the labels and predictions arguments has been switched.') def streaming_true_negatives(predictions, labels, weights=None, @@ -151,6 +155,8 @@ def streaming_true_negatives(predictions, name=name) +@deprecated(None, 'Please switch to tf.metrics.false_positives. Note that the ' + 'order of the labels and predictions arguments has been switched.') def streaming_false_positives(predictions, labels, weights=None, @@ -195,6 +201,8 @@ def streaming_false_positives(predictions, name=name) +@deprecated(None, 'Please switch to tf.metrics.false_negatives. Note that the ' + 'order of the labels and predictions arguments has been switched.') def streaming_false_negatives(predictions, labels, weights=None, @@ -238,6 +246,7 @@ def streaming_false_negatives(predictions, name=name) +@deprecated(None, 'Please switch to tf.metrics.mean') def streaming_mean(values, weights=None, metrics_collections=None, @@ -287,6 +296,7 @@ def streaming_mean(values, name=name) +@deprecated(None, 'Please switch to tf.metrics.mean_tensor') def streaming_mean_tensor(values, weights=None, metrics_collections=None, @@ -340,9 +350,8 @@ def streaming_mean_tensor(values, name=name) -@deprecated(None, - 'Please switch to tf.metrics.accuracy. Note that the order of the ' - 'labels and predictions arguments has been switched.') +@deprecated(None, 'Please switch to tf.metrics.accuracy. Note that the order ' + 'of the labels and predictions arguments has been switched.') def streaming_accuracy(predictions, labels, weights=None, @@ -400,6 +409,8 @@ def streaming_accuracy(predictions, name=name) +@deprecated(None, 'Please switch to tf.metrics.precision. 
Note that the order '
+ 'of the labels and predictions arguments has been switched.')
def streaming_precision(predictions,
labels,
weights=None,
@@ -456,6 +467,8 @@ def streaming_precision(
name=name)
+@deprecated(None, 'Please switch to tf.metrics.recall. Note that the order '
+ 'of the labels and predictions arguments has been switched.')
def streaming_recall(predictions,
labels,
weights=None,
@@ -975,8 +988,8 @@ def streaming_curve_points(labels=None,
return points, update_op
-@deprecated(None, 'Please switch to tf.metrics.auc. Note that the order of the '
- 'labels and predictions arguments has been switched.')
+@deprecated(None, 'Please switch to tf.metrics.auc. Note that the order of '
+ 'the labels and predictions arguments has been switched.')
def streaming_auc(predictions,
labels,
weights=None,
@@ -1797,9 +1810,9 @@ def streaming_sensitivity_at_specificity(predictions,
name=name)
-@deprecated(
- None, 'Please switch to tf.metrics.precision_at_thresholds. Note that the '
- 'order of the labels and predictions arguments has been switched.')
+@deprecated(None,
+ 'Please switch to tf.metrics.precision_at_thresholds. Note that '
+ 'the order of the labels and predictions arguments has been switched.')
def streaming_precision_at_thresholds(predictions,
labels,
thresholds,
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 2f6ae9f3678..b12e2cd5edd 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -2891,7 +2891,7 @@ class WeightNormLSTMCell(rnn_cell_impl.RNNCell):
output_size = weight.get_shape().as_list()[1]
g = vs.get_variable(name, [output_size], dtype=weight.dtype)
- return nn_impl.l2_normalize(weight, dim=0) * g
+ return nn_impl.l2_normalize(weight, axis=0) * g
def _linear(self,
args,
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 9e0d69593f8..f0f143ddfcf 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -610,8 +610,8 @@ def monotonic_attention(p_choose_i, previous_attention, mode):
addition, once an input sequence element is attended to at a given output
timestep, elements occurring before it cannot be attended to at subsequent
output timesteps. This function generates attention distributions according
- to these assumptions. For more information, see ``Online and Linear-Time
- Attention by Enforcing Monotonic Alignments''.
+ to these assumptions. For more information, see `Online and Linear-Time
+ Attention by Enforcing Monotonic Alignments`.
Args:
p_choose_i: Probability of choosing input sequence/memory element i. Should
diff --git a/tensorflow/contrib/sparsemax/__init__.py b/tensorflow/contrib/sparsemax/__init__.py
index 19d213fb3e8..7bc726f4a84 100644
--- a/tensorflow/contrib/sparsemax/__init__.py
+++ b/tensorflow/contrib/sparsemax/__init__.py
@@ -14,7 +14,7 @@
# ==============================================================================
"""Module that implements sparsemax and sparsemax loss, see [1].
-[1] https://arxiv.org/abs/1602.02068 +[1]: https://arxiv.org/abs/1602.02068 ## Sparsemax diff --git a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py index 890ca20f4ca..e617af2ff1b 100644 --- a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py +++ b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py @@ -31,7 +31,7 @@ def sparsemax(logits, name=None): """Computes sparsemax activations [1]. For each batch `i` and class `j` we have - sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0) + $$sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0)$$ [1]: https://arxiv.org/abs/1602.02068 diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc index ff8cc6374d4..b412b296e02 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc @@ -405,7 +405,13 @@ tensorflow::Status ConvertGraphDefToTensorRT( max_mem_per_engine, static_graph_properties, &output_edge_map, precision_mode); if (precision_mode == INT8MODE) { - TF_RETURN_IF_ERROR(GetCalibNode(&p)); + tensorflow::Status status = GetCalibNode(&p); + if (status != tensorflow::Status::OK()) { + LOG(WARNING) << "subgraph conversion error for subgraph_index:" << count + << " due to: \"" << status.ToString() + << "\" SKIPPING......( " << subgraph_node_names.size() + << " nodes)"; + } } else { tensorflow::Status status = ConvertSubGraphToTensorRT(&p); if (status != tensorflow::Status::OK()) { @@ -414,8 +420,8 @@ tensorflow::Status ConvertGraphDefToTensorRT( << "\" SKIPPING......( " << subgraph_node_names.size() << " nodes)"; } - count++; } + count++; } graph.ToGraphDef(new_graph_def); return tensorflow::Status::OK(); diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc index e920a797fe4..b81ae9dc3ee 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc @@ -443,7 +443,9 @@ class Converter { * 2) Control dependency inputs contain caret at the beginning and we * remove this and annotate the edge as a control dependency. ************************************************************************/ - string name = input_name[0] == '^' ? 
input_name.substr(1) : input_name; + // skip control nodes + if (input_name[0] == '^') continue; + string name = input_name; auto first = name.find_first_of(':'); if (first != string::npos && first + 2 == name.size() && name[first + 1] == '0') @@ -2262,6 +2264,7 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) { auto ws = new tensorflow::tensorrt::TRTWeightStore(); TF_CHECK_OK(weight_rmgr->Create(calib_op_name, calib_op_name, ws)); Converter converter(op_res->network_, ws, s.precision_mode == FP16MODE); + std::vector input_names; std::vector input_dtypes; for (const std::pair& input : s.input_inds) { @@ -2270,20 +2273,41 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) { int output_idx = input.second; tensorflow::Node* node = s.graph.FindNodeId(node_id); auto node_name = node->name(); - input_names.push_back(node_name); // insert original node name without port - // TODO(jie): alternative :) - if (!s.graph_properties.HasOutputProperties(node_name)) + // input_names should use the node name in the graph + // here it should be the input tensor name -> matching the binding + // insert original node name without port + auto tensor_name = node_name; + if (output_idx != 0) { + tensor_name = StrCat(tensor_name, ":", output_idx); + } + + VLOG(2) << "input name: " << node_name << " tensor_name: " << tensor_name + << " idx: " << output_idx; + + auto shape_inference_node_name = node_name; + auto shape_inference_output_idx = output_idx; + // rewire the shape inference to original node in the graph + if (s.output_edge_map->count(tensor_name)) { + shape_inference_node_name = s.output_edge_map->at(tensor_name).second; + shape_inference_output_idx = s.output_edge_map->at(tensor_name).first; + } + if (shape_inference_output_idx < 0) continue; + VLOG(2) << "shapeinference name: " << shape_inference_node_name + << " idx: " << shape_inference_output_idx; + + if (!s.graph_properties.HasOutputProperties(shape_inference_node_name)) return tensorflow::errors::Internal("failed to find input node: " + - node_name); + shape_inference_node_name); - auto op_info_vec = s.graph_properties.GetOutputProperties(node_name); - if (static_cast(op_info_vec.size()) < output_idx) + auto op_info_vec = + s.graph_properties.GetOutputProperties(shape_inference_node_name); + if (static_cast(op_info_vec.size()) <= shape_inference_output_idx) return tensorflow::errors::Internal( - "accessing output index of: ", output_idx, ", at node: ", node_name, - "with output entry from shape_map: ", op_info_vec.size()); - - auto op_info = op_info_vec.at(output_idx); + "accessing output index of: ", shape_inference_output_idx, + ", at node: ", shape_inference_node_name, + " with output entry from shape_map: ", op_info_vec.size()); + auto op_info = op_info_vec.at(shape_inference_output_idx); tensorflow::DataType tf_dtype = op_info.dtype(); input_dtypes.push_back(tf_dtype); @@ -2294,16 +2318,23 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) { << "' failed"; return type_status; } - TF_CHECK_OK(ConvertDType(tf_dtype, &dtype)); VLOG(2) << "accessing output index of: " << output_idx << ", at node: " << node_name << "with output entry from shape_map: " << op_info_vec.size(); - // TODO(ben,jie): update TRT input format/dimension nvinfer1::DimsCHW input_dim_psuedo_chw; for (int i = 0; i < 3; i++) input_dim_psuedo_chw.d[i] = 1; + // TODO(jie): TRT 3.x only support 4 dimensional input tensor. + // update the code once TRT 4.0 comes out. 
+ if (op_info.shape().dim_size() != 4) { + string err_str = "Require 4 dimensional input."; + StrAppend(&err_str, " Got ", op_info.shape().dim_size(), " ", + shape_inference_node_name); + return tensorflow::errors::Unimplemented(err_str); + } + for (int i = 1; i < op_info.shape().dim_size(); i++) { VLOG(2) << "dimension: " << i << " , size: " << op_info.shape().dim(i).size(); @@ -2312,8 +2343,11 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) { // TODO(ben,jie): proper way to restore input tensor name? auto input_tensor_name = node_name; - if (output_idx != 0) input_tensor_name = StrCat(node_name, ":", output_idx); + if (output_idx != 0) { + input_tensor_name = StrCat(node_name, ":", output_idx); + } + input_names.push_back(input_tensor_name); nvinfer1::ITensor* input_tensor = converter.network()->addInput( input_tensor_name.c_str(), dtype, input_dim_psuedo_chw); @@ -2377,11 +2411,13 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) { tensor->setType(trt_dtype); } - VLOG(2) << "finished output"; + VLOG(2) << "Finished processing outputs"; // Build the engine op_res->builder_->setMaxBatchSize(s.max_batch_size); op_res->builder_->setMaxWorkspaceSize(s.max_workspace_size_bytes); + VLOG(0) << "Max batch size= " << s.max_batch_size + << " max workspace size= " << s.max_workspace_size_bytes; // Build the TRT op // TODO(sami,ben,jie): proper naming! @@ -2475,7 +2511,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef( std::vector input_names; std::vector input_dtypes; for (const std::pair& input : s.input_inds) { - VLOG(2) << "parsing input!!!!!"; + VLOG(2) << "parsing input. Node id= " << input.first; int node_id = input.first; int output_idx = input.second; tensorflow::Node* node = s.graph.FindNodeId(node_id); diff --git a/tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt b/tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt new file mode 100644 index 00000000000..803d8970ab7 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt @@ -0,0 +1,36 @@ +op { + graph_op_name: "ClipByValue" + in_arg { + name: "t" + description: <
| **Version:** | **CPU/GPU:** | **Python Version:** | **Compiler:** | **Build Tools:** | **cuDNN:** | **CUDA:** |
|---|---|---|---|---|---|---|
| tensorflow-1.8.0 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A |
| tensorflow_gpu-1.8.0 | GPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | 7 | 9 |
| tensorflow-1.7.0 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A |
| tensorflow_gpu-1.7.0 | GPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | 7 | 9 |
| tensorflow-1.6.0 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A |
| tensorflow-1.0.0 | CPU | 3.5 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A |
| tensorflow_gpu-1.0.0 | GPU | 3.5 | MSVC 2015 update 3 | Cmake v3.6.3 | 5.1 | 8 |
+
+
+## Build the C or Java libraries
+
+The instructions above are tailored to building the TensorFlow Python packages.
+
+If you're interested in building the libraries for the TensorFlow C API, do the
+following:
+
+1. Follow the steps up to [Configure the installation](#ConfigureInstallation)
+2. Build the C libraries following instructions in the [README](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/lib_package/README.md).
+
+If you're interested in building the libraries for the TensorFlow Java API,
+do the following:
+
+1. Follow the steps up to [Configure the installation](#ConfigureInstallation)
+2. Build the Java library following instructions in the [README](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/lib_package/README.md).
From d218339e6a05a984ef7b9a49d66db219d862936e Mon Sep 17 00:00:00 2001
From: Yifei Feng
Date: Thu, 19 Apr 2018 01:26:07 -0700
Subject: [PATCH 0409/1734] Remove proto import in header files for
core/kernels/boosted_trees.
Move implementations that require declaration of TreeEnsemble to .cc files.
The goal is to make kernels mostly independent of proto headers, which will
let us lock down our .so imports.
PiperOrigin-RevId: 193478404
---
.../core/kernels/boosted_trees/resources.cc | 138 ++++++++++++++++++
.../core/kernels/boosted_trees/resources.h | 128 ++++------------
2 files changed, 165 insertions(+), 101 deletions(-)
diff --git a/tensorflow/core/kernels/boosted_trees/resources.cc b/tensorflow/core/kernels/boosted_trees/resources.cc
index 2ea12c522c8..c410748c27e 100644
--- a/tensorflow/core/kernels/boosted_trees/resources.cc
+++ b/tensorflow/core/kernels/boosted_trees/resources.cc
@@ -21,6 +21,35 @@ limitations under the License.
namespace tensorflow {
+// Constructor.
+BoostedTreesEnsembleResource::BoostedTreesEnsembleResource() + : tree_ensemble_( + protobuf::Arena::CreateMessage( + &arena_)) {} + +string BoostedTreesEnsembleResource::DebugString() { + return strings::StrCat("TreeEnsemble[size=", tree_ensemble_->trees_size(), + "]"); +} + +bool BoostedTreesEnsembleResource::InitFromSerialized(const string& serialized, + const int64 stamp_token) { + CHECK_EQ(stamp(), -1) << "Must Reset before Init."; + if (ParseProtoUnlimited(tree_ensemble_, serialized)) { + set_stamp(stamp_token); + return true; + } + return false; +} + +string BoostedTreesEnsembleResource::SerializeAsString() const { + return tree_ensemble_->SerializeAsString(); +} + +int32 BoostedTreesEnsembleResource::num_trees() const { + return tree_ensemble_->trees_size(); +} + int32 BoostedTreesEnsembleResource::next_node( const int32 tree_id, const int32 node_id, const int32 index_in_batch, const std::vector::ConstVec>& bucketized_features) const { @@ -49,6 +78,115 @@ float BoostedTreesEnsembleResource::node_value(const int32 tree_id, } } +int32 BoostedTreesEnsembleResource::GetNumLayersGrown( + const int32 tree_id) const { + DCHECK_LT(tree_id, tree_ensemble_->trees_size()); + return tree_ensemble_->tree_metadata(tree_id).num_layers_grown(); +} + +void BoostedTreesEnsembleResource::SetNumLayersGrown( + const int32 tree_id, int32 new_num_layers) const { + DCHECK_LT(tree_id, tree_ensemble_->trees_size()); + tree_ensemble_->mutable_tree_metadata(tree_id)->set_num_layers_grown( + new_num_layers); +} + +void BoostedTreesEnsembleResource::UpdateLastLayerNodesRange( + const int32 node_range_start, int32 node_range_end) const { + tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_start( + node_range_start); + tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_end( + node_range_end); +} + +void BoostedTreesEnsembleResource::GetLastLayerNodesRange( + int32* node_range_start, int32* node_range_end) const { + *node_range_start = + tree_ensemble_->growing_metadata().last_layer_node_start(); + *node_range_end = tree_ensemble_->growing_metadata().last_layer_node_end(); +} + +int64 BoostedTreesEnsembleResource::GetNumNodes(const int32 tree_id) { + DCHECK_LT(tree_id, tree_ensemble_->trees_size()); + return tree_ensemble_->trees(tree_id).nodes_size(); +} + +int32 BoostedTreesEnsembleResource::GetNumLayersAttempted() { + return tree_ensemble_->growing_metadata().num_layers_attempted(); +} + +bool BoostedTreesEnsembleResource::is_leaf(const int32 tree_id, + const int32 node_id) const { + DCHECK_LT(tree_id, tree_ensemble_->trees_size()); + DCHECK_LT(node_id, tree_ensemble_->trees(tree_id).nodes_size()); + const auto& node = tree_ensemble_->trees(tree_id).nodes(node_id); + return node.node_case() == boosted_trees::Node::kLeaf; +} + +int32 BoostedTreesEnsembleResource::feature_id(const int32 tree_id, + const int32 node_id) const { + const auto node = tree_ensemble_->trees(tree_id).nodes(node_id); + DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit); + return node.bucketized_split().feature_id(); +} + +int32 BoostedTreesEnsembleResource::bucket_threshold( + const int32 tree_id, const int32 node_id) const { + const auto node = tree_ensemble_->trees(tree_id).nodes(node_id); + DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit); + return node.bucketized_split().threshold(); +} + +int32 BoostedTreesEnsembleResource::left_id(const int32 tree_id, + const int32 node_id) const { + const auto node = tree_ensemble_->trees(tree_id).nodes(node_id); + 
DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit); + return node.bucketized_split().left_id(); +} + +int32 BoostedTreesEnsembleResource::right_id(const int32 tree_id, + const int32 node_id) const { + const auto node = tree_ensemble_->trees(tree_id).nodes(node_id); + DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit); + return node.bucketized_split().right_id(); +} + +std::vector BoostedTreesEnsembleResource::GetTreeWeights() const { + return {tree_ensemble_->tree_weights().begin(), + tree_ensemble_->tree_weights().end()}; +} + +float BoostedTreesEnsembleResource::GetTreeWeight(const int32 tree_id) const { + return tree_ensemble_->tree_weights(tree_id); +} + +float BoostedTreesEnsembleResource::IsTreeFinalized(const int32 tree_id) const { + DCHECK_LT(tree_id, tree_ensemble_->trees_size()); + return tree_ensemble_->tree_metadata(tree_id).is_finalized(); +} + +float BoostedTreesEnsembleResource::IsTreePostPruned( + const int32 tree_id) const { + DCHECK_LT(tree_id, tree_ensemble_->trees_size()); + return tree_ensemble_->tree_metadata(tree_id).post_pruned_nodes_meta_size() > + 0; +} + +void BoostedTreesEnsembleResource::SetIsFinalized(const int32 tree_id, + const bool is_finalized) { + DCHECK_LT(tree_id, tree_ensemble_->trees_size()); + return tree_ensemble_->mutable_tree_metadata(tree_id)->set_is_finalized( + is_finalized); +} + +// Sets the weight of i'th tree. +void BoostedTreesEnsembleResource::SetTreeWeight(const int32 tree_id, + const float weight) { + DCHECK_GE(tree_id, 0); + DCHECK_LT(tree_id, num_trees()); + tree_ensemble_->set_tree_weights(tree_id, weight); +} + void BoostedTreesEnsembleResource::UpdateGrowingMetadata() const { tree_ensemble_->mutable_growing_metadata()->set_num_layers_attempted( tree_ensemble_->growing_metadata().num_layers_attempted() + 1); diff --git a/tensorflow/core/kernels/boosted_trees/resources.h b/tensorflow/core/kernels/boosted_trees/resources.h index 561ca3a18a7..df78d3f275b 100644 --- a/tensorflow/core/kernels/boosted_trees/resources.h +++ b/tensorflow/core/kernels/boosted_trees/resources.h @@ -17,12 +17,16 @@ limitations under the License. #define TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_RESOURCES_H_ #include "tensorflow/core/framework/resource_mgr.h" -#include "tensorflow/core/kernels/boosted_trees/boosted_trees.pb.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/protobuf.h" namespace tensorflow { +// Forward declaration for proto class TreeEnsemble +namespace boosted_trees { +class TreeEnsemble; +} // namespace boosted_trees + // A StampedResource is a resource that has a stamp token associated with it. // Before reading from or applying updates to the resource, the stamp should // be checked to verify that the update is not stale. @@ -42,31 +46,15 @@ class StampedResource : public ResourceBase { // Keep a tree ensemble in memory for efficient evaluation and mutation. class BoostedTreesEnsembleResource : public StampedResource { public: - // Constructor. 
-  BoostedTreesEnsembleResource()
-      : tree_ensemble_(
-            protobuf::Arena::CreateMessage<boosted_trees::TreeEnsemble>(
-                &arena_)) {}
+  BoostedTreesEnsembleResource();
 
-  string DebugString() override {
-    return strings::StrCat("TreeEnsemble[size=", tree_ensemble_->trees_size(),
-                           "]");
-  }
+  string DebugString() override;
 
-  bool InitFromSerialized(const string& serialized, const int64 stamp_token) {
-    CHECK_EQ(stamp(), -1) << "Must Reset before Init.";
-    if (ParseProtoUnlimited(tree_ensemble_, serialized)) {
-      set_stamp(stamp_token);
-      return true;
-    }
-    return false;
-  }
+  bool InitFromSerialized(const string& serialized, const int64 stamp_token);
 
-  string SerializeAsString() const {
-    return tree_ensemble_->SerializeAsString();
-  }
+  string SerializeAsString() const;
 
-  int32 num_trees() const { return tree_ensemble_->trees_size(); }
+  int32 num_trees() const;
 
   // Find the next node to which the example (specified by index_in_batch)
   // traverses down from the current node indicated by tree_id and node_id.
@@ -82,73 +70,31 @@ class BoostedTreesEnsembleResource : public StampedResource {
 
   float node_value(const int32 tree_id, const int32 node_id) const;
 
-  int32 GetNumLayersGrown(const int32 tree_id) const {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    return tree_ensemble_->tree_metadata(tree_id).num_layers_grown();
-  }
+  int32 GetNumLayersGrown(const int32 tree_id) const;
 
-  void SetNumLayersGrown(const int32 tree_id, int32 new_num_layers) const {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    tree_ensemble_->mutable_tree_metadata(tree_id)->set_num_layers_grown(
-        new_num_layers);
-  }
+  void SetNumLayersGrown(const int32 tree_id, int32 new_num_layers) const;
 
   void UpdateLastLayerNodesRange(const int32 node_range_start,
-                                 int32 node_range_end) const {
-    tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_start(
-        node_range_start);
-    tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_end(
-        node_range_end);
-  }
+                                 int32 node_range_end) const;
 
   void GetLastLayerNodesRange(int32* node_range_start,
-                              int32* node_range_end) const {
-    *node_range_start =
-        tree_ensemble_->growing_metadata().last_layer_node_start();
-    *node_range_end = tree_ensemble_->growing_metadata().last_layer_node_end();
-  }
+                              int32* node_range_end) const;
 
-  int64 GetNumNodes(const int32 tree_id) {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    return tree_ensemble_->trees(tree_id).nodes_size();
-  }
+  int64 GetNumNodes(const int32 tree_id);
 
   void UpdateGrowingMetadata() const;
 
-  int32 GetNumLayersAttempted() {
-    return tree_ensemble_->growing_metadata().num_layers_attempted();
-  }
+  int32 GetNumLayersAttempted();
 
-  bool is_leaf(const int32 tree_id, const int32 node_id) const {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    DCHECK_LT(node_id, tree_ensemble_->trees(tree_id).nodes_size());
-    const auto& node = tree_ensemble_->trees(tree_id).nodes(node_id);
-    return node.node_case() == boosted_trees::Node::kLeaf;
-  }
+  bool is_leaf(const int32 tree_id, const int32 node_id) const;
 
-  int32 feature_id(const int32 tree_id, const int32 node_id) const {
-    const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
-    DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
-    return node.bucketized_split().feature_id();
-  }
+  int32 feature_id(const int32 tree_id, const int32 node_id) const;
 
-  int32 bucket_threshold(const int32 tree_id, const int32 node_id) const {
-    const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
-    DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
-    return node.bucketized_split().threshold();
-  }
+  int32 bucket_threshold(const int32 tree_id, const int32 node_id) const;
 
-  int32 left_id(const int32 tree_id, const int32 node_id) const {
-    const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
-    DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
-    return node.bucketized_split().left_id();
-  }
+  int32 left_id(const int32 tree_id, const int32 node_id) const;
 
-  int32 right_id(const int32 tree_id, const int32 node_id) const {
-    const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
-    DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
-    return node.bucketized_split().right_id();
-  }
+  int32 right_id(const int32 tree_id, const int32 node_id) const;
 
   // Add a tree to the ensemble and returns a new tree_id.
   int32 AddNewTree(const float weight);
@@ -163,38 +109,18 @@ class BoostedTreesEnsembleResource : public StampedResource {
   // Retrieves tree weights and returns as a vector.
   // It involves a copy, so should be called only sparingly (like once per
   // iteration, not per example).
-  std::vector<float> GetTreeWeights() const {
-    return {tree_ensemble_->tree_weights().begin(),
-            tree_ensemble_->tree_weights().end()};
-  }
+  std::vector<float> GetTreeWeights() const;
 
-  float GetTreeWeight(const int32 tree_id) const {
-    return tree_ensemble_->tree_weights(tree_id);
-  }
+  float GetTreeWeight(const int32 tree_id) const;
 
-  float IsTreeFinalized(const int32 tree_id) const {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    return tree_ensemble_->tree_metadata(tree_id).is_finalized();
-  }
+  float IsTreeFinalized(const int32 tree_id) const;
 
-  float IsTreePostPruned(const int32 tree_id) const {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    return tree_ensemble_->tree_metadata(tree_id)
-               .post_pruned_nodes_meta_size() > 0;
-  }
+  float IsTreePostPruned(const int32 tree_id) const;
 
-  void SetIsFinalized(const int32 tree_id, const bool is_finalized) {
-    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
-    return tree_ensemble_->mutable_tree_metadata(tree_id)->set_is_finalized(
-        is_finalized);
-  }
+  void SetIsFinalized(const int32 tree_id, const bool is_finalized);
 
   // Sets the weight of i'th tree.
-  void SetTreeWeight(const int32 tree_id, const float weight) {
-    DCHECK_GE(tree_id, 0);
-    DCHECK_LT(tree_id, num_trees());
-    tree_ensemble_->set_tree_weights(tree_id, weight);
-  }
+  void SetTreeWeight(const int32 tree_id, const float weight);
 
   // Resets the resource and frees the protos in arena.
   // Caller needs to hold the mutex lock while calling this.
From b2536f05bb156612c96f204041ea31980b711fc8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Thu, 19 Apr 2018 01:56:31 -0700
Subject: [PATCH 0410/1734] Update feature_util's GetFeatures to show compile-time error for unsupported types instead of a link-time error.

PiperOrigin-RevId: 193480683
---
 tensorflow/core/example/feature_util.h | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/example/feature_util.h b/tensorflow/core/example/feature_util.h
index d977935b8a3..2265498b5e2 100644
--- a/tensorflow/core/example/feature_util.h
+++ b/tensorflow/core/example/feature_util.h
@@ -182,13 +182,25 @@ struct FeatureTrait<
 
 // Returns true if sequence_example has a feature_list with the specified key.
 bool HasFeatureList(const string& key,
                     const SequenceExample& sequence_example);
 
+template <typename T>
+struct TypeHasFeatures : std::false_type {};
+
+template <>
+struct TypeHasFeatures<Example> : std::true_type {};
+
+template <>
+struct TypeHasFeatures<Features> : std::true_type {};
+
 // A family of template functions to return mutable Features proto from a
 // container proto. Supported ProtoTypes: Example, Features.
 template <typename ProtoType>
-Features* GetFeatures(ProtoType* proto);
+typename std::enable_if<TypeHasFeatures<ProtoType>::value, Features*>::type
+GetFeatures(ProtoType* proto);
 
 template <typename ProtoType>
-const Features& GetFeatures(const ProtoType& proto);
+typename std::enable_if<TypeHasFeatures<ProtoType>::value,
+                        const Features&>::type
+GetFeatures(const ProtoType& proto);
 
 // Base declaration of a family of template functions to return a read only
 // repeated field of feature values.
@@ -300,7 +312,7 @@ bool HasFeature(const string& key, const Features& features);
 template <typename... FeatureType>
 bool HasFeature(const string& key, const Example& example) {
   return HasFeature<FeatureType...>(key, GetFeatures(example));
-};
+}
 
 // DEPRECATED: use HasFeature instead.
 // TODO(gorban): update all clients in a followup CL.
From 5fb3c64421f53aa7ef58ffcee6de47cd4a40fe2d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Thu, 19 Apr 2018 02:58:31 -0700
Subject: [PATCH 0411/1734] Set the random seed in on-demand mode.

PiperOrigin-RevId: 193488103
---
 tensorflow/compiler/jit/xla_compile_on_demand_op.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
index 682d6ea8ccc..6c2782e28e9 100644
--- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
+++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
@@ -67,6 +67,7 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx,
   run_options.set_stream(stream);
   run_options.set_allocator(client->backend().memory_allocator());
   run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device());
+  run_options.set_rng_seed(ctx->step_id());
 
   auto run_result = executable->Run(launch_context.arguments(), run_options);
   TF_RETURN_IF_ERROR(run_result.status());
From bf86d3a46b4e2ef4dabcba211c1ce36cb81ac315 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Thu, 19 Apr 2018 04:27:38 -0700
Subject: [PATCH 0412/1734] Handle corner case in Python 3: members annotated with @classmethod.

PiperOrigin-RevId: 193495506
---
 tensorflow/contrib/autograph/pyct/inspect_utils.py | 12 +++++++-----
 .../contrib/autograph/pyct/inspect_utils_test.py   |  7 +++++++
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils.py b/tensorflow/contrib/autograph/pyct/inspect_utils.py
index a0f56a6c1f8..eef74599a7d 100644
--- a/tensorflow/contrib/autograph/pyct/inspect_utils.py
+++ b/tensorflow/contrib/autograph/pyct/inspect_utils.py
@@ -75,13 +75,15 @@ def getdefiningclass(m, owner_class):
   """Resolves the class (e.g. one of the superclasses) that defined a method."""
   # Normalize bound functions to their respective unbound versions.
m = _get_unbound_function(m) - last_defining = owner_class - for superclass in tf_inspect.getmro(owner_class): + for superclass in owner_class.__bases__: if hasattr(superclass, m.__name__): superclass_m = getattr(superclass, m.__name__) - if _get_unbound_function(superclass_m) == m: - last_defining = superclass - return last_defining + if _get_unbound_function(superclass_m) is m: + return superclass + elif hasattr(m, '__self__') and m.__self__ == owner_class: + # Python 3 class methods only work this way it seems :S + return superclass + return owner_class def getmethodclass(m): diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils_test.py b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py index cf841dae814..1a212f676a6 100644 --- a/tensorflow/contrib/autograph/pyct/inspect_utils_test.py +++ b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py @@ -243,6 +243,10 @@ class InspectUtilsTest(test.TestCase): def bar(self): pass + @classmethod + def class_method(cls): + pass + class Subclass(Superclass): def foo(self): @@ -257,6 +261,9 @@ class InspectUtilsTest(test.TestCase): inspect_utils.getdefiningclass(Subclass.bar, Subclass) is Superclass) self.assertTrue( inspect_utils.getdefiningclass(Subclass.baz, Subclass) is Subclass) + self.assertTrue( + inspect_utils.getdefiningclass(Subclass.class_method, Subclass) is + Superclass) def test_isbuiltin(self): self.assertTrue(inspect_utils.isbuiltin(range)) From 06d802ab61987bde76a30098ff7930c27d561375 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 19 Apr 2018 05:11:30 -0700 Subject: [PATCH 0413/1734] Support for converting entire class hierarchies: * limit the methods being converted to those that have not been inherited from the superclass * include the (possibly compiled) superclass in the definition of the compiled class * either mark the superclass for conversion or generate an absolute aliased import line, depending on whether it's whitelisted PiperOrigin-RevId: 193499204 --- .../autograph/converters/call_trees.py | 10 ++-- tensorflow/contrib/autograph/impl/api.py | 2 +- .../contrib/autograph/impl/conversion.py | 58 +++++++++++++++--- .../contrib/autograph/impl/conversion_test.py | 60 +++++++++++++++++++ 4 files changed, 117 insertions(+), 13 deletions(-) diff --git a/tensorflow/contrib/autograph/converters/call_trees.py b/tensorflow/contrib/autograph/converters/call_trees.py index e390d1a262b..2e5590b46cd 100644 --- a/tensorflow/contrib/autograph/converters/call_trees.py +++ b/tensorflow/contrib/autograph/converters/call_trees.py @@ -245,8 +245,6 @@ class CallTreeTransformer(transformer.Base): new_call.keywords = node.keywords return new_call - # pylint:disable=invalid-name - def visit_Expr(self, node): if isinstance(node.value, gast.Call): if anno.hasanno(node.value.func, 'live_val'): @@ -294,15 +292,17 @@ class CallTreeTransformer(transformer.Base): raise NotImplementedError( 'py_func with return values (unknown function)') else: - if self.context.recursive: + if ast_util.matches(node, 'super(_)'): + # super() calls are preserved. The class conversion mechanism will + # ensure that they return the correct value. + pass + elif self.context.recursive: node = self._insert_dynamic_conversion(node) else: # Unresolved functions are allowed in non-recursive mode. pass return node - # pylint:enable=invalid-name - def transform(node, context, uncompiled_modules, nocompile_decorators): """Transform function call to the compiled counterparts. 
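Aside: the @classmethod corner case patched into getdefiningclass above comes from how Python 3 binds classmethods. A minimal, TensorFlow-independent sketch (class names here are purely illustrative):

    class Base(object):

      @classmethod
      def make(cls):
        return cls()


    class Sub(Base):
      pass


    # Looking the classmethod up on either class yields a *bound* method whose
    # __self__ is the class used for the lookup, so the two lookups compare
    # unequal even though Base is the defining class.
    print(Sub.make == Base.make)                    # False on Python 3
    print(Sub.make.__self__ is Sub)                 # True: bound to Sub, not Base
    print(Sub.make.__func__ is Base.make.__func__)  # True: same underlying function

This appears to be why the patch falls back to inspecting __self__ rather than relying on the identity comparison of the normalized functions alone.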
diff --git a/tensorflow/contrib/autograph/impl/api.py b/tensorflow/contrib/autograph/impl/api.py
index f97a33326ec..d874ef15c93 100644
--- a/tensorflow/contrib/autograph/impl/api.py
+++ b/tensorflow/contrib/autograph/impl/api.py
@@ -241,7 +241,7 @@ def to_graph(e,
   module = gast.Module([])
   for import_line in config.COMPILED_IMPORT_STATEMENTS:
     module.body.extend(parser.parse_str(import_line).body)
-  for dep in conversion_map.dependency_cache.values():
+  for dep in reversed(conversion_map.dependency_cache.values()):
     module.body.append(dep)
   compiled_node, compiled_src = compiler.ast_to_object(module)
 
diff --git a/tensorflow/contrib/autograph/impl/conversion.py b/tensorflow/contrib/autograph/impl/conversion.py
index 5653e991f60..e7230a5f450 100644
--- a/tensorflow/contrib/autograph/impl/conversion.py
+++ b/tensorflow/contrib/autograph/impl/conversion.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import imp
 
 import gast
@@ -39,6 +40,7 @@ from tensorflow.contrib.autograph.converters import side_effect_guards
 from tensorflow.contrib.autograph.converters import single_return
 from tensorflow.contrib.autograph.impl import config
 from tensorflow.contrib.autograph.impl import naming
+from tensorflow.contrib.autograph.pyct import ast_util
 from tensorflow.contrib.autograph.pyct import context
 from tensorflow.contrib.autograph.pyct import inspect_utils
 from tensorflow.contrib.autograph.pyct import parser
@@ -81,7 +83,9 @@ class ConversionMap(object):
     self.recursive = recursive
     self.nocompile_decorators = nocompile_decorators
     self.partial_types = partial_types if partial_types else ()
-    self.dependency_cache = {}
+    # Required to output dependencies in discovery order, which should match
+    # the reverse dependency order.
+    self.dependency_cache = collections.OrderedDict()
    self.additional_imports = set()
     self.name_map = {}
     self.api_module = api_module
@@ -201,6 +205,9 @@ def class_to_graph(c, conversion_map):
 
   class_namespace = {}
   for _, m in members:
+    # Only convert the members that are directly defined by the class.
+    if inspect_utils.getdefiningclass(m, c) is not c:
+      continue
     node, _, namespace = function_to_graph(
         m,
         conversion_map=conversion_map,
@@ -214,12 +221,49 @@ def class_to_graph(c, conversion_map):
       converted_members[m] = node
   namer = conversion_map.new_namer(class_namespace)
   class_name = namer.compiled_class_name(c.__name__, c)
-  node = gast.ClassDef(
-      class_name,
-      bases=[],
-      keywords=[],
-      body=list(converted_members.values()),
-      decorator_list=[])
+
+  # TODO(mdan): This needs to be explained more thoroughly.
+  # Process any base classes: if the superclass is of a whitelisted type, an
+  # absolute import line is generated. Otherwise, it is marked for conversion
+  # (as a side effect of the call to namer.compiled_class_name() followed by
+  # conversion_map.update_name_map(namer)).
+  output_nodes = []
+  renames = {}
+  bases = []
+  for base in c.__bases__:
+    if isinstance(object, base):
+      bases.append('object')
+      continue
+    if is_whitelisted_for_graph(base):
+      alias = namer.new_symbol(base.__name__, ())
+      output_nodes.append(
+          gast.ImportFrom(
+              module=base.__module__,
+              names=[gast.alias(name=base.__name__, asname=alias)],
+              level=0))
+    else:
+      # This will trigger a conversion into a class with this name.
+ alias = namer.compiled_class_name(base.__name__, base) + bases.append(alias) + renames[qual_names.QN(base.__name__)] = qual_names.QN(alias) + conversion_map.update_name_map(namer) + + # Generate the definition of the converted class. + output_nodes.append( + gast.ClassDef( + class_name, + bases=bases, + keywords=[], + body=list(converted_members.values()), + decorator_list=[])) + node = gast.Module(output_nodes) + + # Make a final pass to replace references to the class or its base classes. + # Most commonly, this occurs when making super().__init__() calls. + # TODO(mdan): Making direct references to superclass' superclass will fail. + node = qual_names.resolve(node) + renames[qual_names.QN(c.__name__)] = qual_names.QN(class_name) + node = ast_util.rename_symbols(node, renames) return node, class_name, class_namespace diff --git a/tensorflow/contrib/autograph/impl/conversion_test.py b/tensorflow/contrib/autograph/impl/conversion_test.py index da3220892f2..5edd8e74a88 100644 --- a/tensorflow/contrib/autograph/impl/conversion_test.py +++ b/tensorflow/contrib/autograph/impl/conversion_test.py @@ -24,6 +24,7 @@ from tensorflow.contrib.autograph import utils from tensorflow.contrib.autograph.impl import api from tensorflow.contrib.autograph.impl import conversion from tensorflow.python.framework import constant_op +from tensorflow.python.keras._impl.keras.engine import training from tensorflow.python.platform import test @@ -78,6 +79,65 @@ class ConversionTest(test.TestCase): conversion_map.dependency_cache[f].body[0].body[0].value.func.id) self.assertEqual('tf__g', conversion_map.dependency_cache[g].name) + def test_entity_to_graph_class_hierarchy(self): + + class TestBase(object): + + def __init__(self, x='base'): + self.x = x + + def foo(self): + return self.x + + def bar(self): + return self.x + + class TestSubclass(TestBase): + + def __init__(self, y): + super(TestSubclass, self).__init__('sub') + self.y = y + + def foo(self): + return self.y + + def baz(self): + return self.y + + conversion_map = self._simple_conversion_map() + conversion.entity_to_graph(TestSubclass, conversion_map, None, None) + + self.assertTrue(TestBase in conversion_map.dependency_cache) + self.assertTrue(TestSubclass in conversion_map.dependency_cache) + self.assertEqual('TfTestBase', + conversion_map.dependency_cache[TestBase].body[-1].name) + self.assertEqual( + 'TfTestSubclass', + conversion_map.dependency_cache[TestSubclass].body[-1].name) + + def test_entity_to_graph_class_hierarchy_whitelisted(self): + + class TestSubclass(training.Model): + + def __init__(self, y): + super(TestSubclass, self).__init__() + self.built = False + + def call(self, x): + return 3 * x + + conversion_map = self._simple_conversion_map() + conversion.entity_to_graph(TestSubclass, conversion_map, None, None) + + self.assertTrue(TestSubclass in conversion_map.dependency_cache) + self.assertFalse(training.Model in conversion_map.dependency_cache) + self.assertEqual( + 'Model', + conversion_map.dependency_cache[TestSubclass].body[0].names[0].name) + self.assertEqual( + 'TfTestSubclass', + conversion_map.dependency_cache[TestSubclass].body[-1].name) + def test_entity_to_graph_lambda(self): f = lambda a: a From 40f77655affb162d32b7d4861fa68c35fc3d8f7a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 19 Apr 2018 06:58:34 -0700 Subject: [PATCH 0414/1734] Update the Colorbot demo to use a Keras model in addition to the Estimator. 
PiperOrigin-RevId: 193508874 --- ...imator.ipynb => rnn_keras_estimator.ipynb} | 677 +++++------------- 1 file changed, 167 insertions(+), 510 deletions(-) rename tensorflow/contrib/autograph/examples/notebooks/{rnn_colorbot_estimator.ipynb => rnn_keras_estimator.ipynb} (50%) diff --git a/tensorflow/contrib/autograph/examples/notebooks/rnn_colorbot_estimator.ipynb b/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb similarity index 50% rename from tensorflow/contrib/autograph/examples/notebooks/rnn_colorbot_estimator.ipynb rename to tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb index 7f5e4d4ac12..324b23c24b5 100644 --- a/tensorflow/contrib/autograph/examples/notebooks/rnn_colorbot_estimator.ipynb +++ b/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb @@ -62,7 +62,7 @@ } }, "source": [ - "# Case study: building an RNN\n" + "# Case study: training a custom RNN, using Keras and Estimators\n" ] }, { @@ -118,6 +118,16 @@ " length = tf.cast(tf.shape(chars)[0], dtype=tf.int64)\n", " return rgb, chars, length\n", "\n", + "\n", + "def set_static_batch_shape(batch_size):\n", + " def apply(rgb, chars, length):\n", + " rgb.set_shape((batch_size, None))\n", + " chars.set_shape((batch_size, None, 256))\n", + " length.set_shape((batch_size,))\n", + " return rgb, chars, length\n", + " return apply\n", + "\n", + "\n", "def load_dataset(data_dir, url, batch_size, training=True):\n", " \"\"\"Loads the colors data at path into a tf.PaddedDataset.\"\"\"\n", " path = tf.keras.utils.get_file(os.path.basename(url), url, cache_dir=data_dir)\n", @@ -129,7 +139,10 @@ " if training:\n", " dataset = dataset.shuffle(buffer_size=3000)\n", " dataset = dataset.padded_batch(\n", - " batch_size, padded_shapes=([None], [None, None], []))\n", + " batch_size, padded_shapes=((None,), (None, 256), ()))\n", + " # To simplify the model code, we statically set as many of the shapes that we\n", + " # know.\n", + " dataset = dataset.map(set_static_batch_shape(batch_size))\n", " return dataset" ] }, @@ -145,7 +158,8 @@ "source": [ "To show the use of control flow, we write the RNN loop by hand, rather than using a pre-built RNN model.\n", "\n", - "Note how we write the model code in Eager style, with regular `if` and `while` statements. Then, we annotate the functions with `@autograph.convert` to have them automatically compiled to run in graph mode." + "Note how we write the model code in Eager style, with regular `if` and `while` statements. Then, we annotate the functions with `@autograph.convert` to have them automatically compiled to run in graph mode.\n", + "We use Keras to define the model, and we will train it using Estimators." 
] }, { @@ -166,70 +180,72 @@ }, "outputs": [], "source": [ - "class RnnColorbot(object):\n", - " \"\"\"Holds the parameters of the colorbot model.\"\"\"\n", + "@autograph.convert()\n", + "class RnnColorbot(tf.keras.Model):\n", + " \"\"\"RNN Colorbot model.\"\"\"\n", "\n", " def __init__(self):\n", + " super(RnnColorbot, self).__init__()\n", " self.lower_cell = tf.contrib.rnn.LSTMBlockCell(256)\n", " self.upper_cell = tf.contrib.rnn.LSTMBlockCell(128)\n", " self.relu_layer = tf.layers.Dense(3, activation=tf.nn.relu)\n", "\n", + "\n", + " def _rnn_layer(self, chars, cell, batch_size, training):\n", + " \"\"\"A single RNN layer.\n", + "\n", + " Args:\n", + " chars: A Tensor of shape (max_sequence_length, batch_size, input_size)\n", + " cell: An object of type tf.contrib.rnn.LSTMBlockCell\n", + " batch_size: Int, the batch size to use\n", + " training: Boolean, whether the layer is used for training\n", + "\n", + " Returns:\n", + " A Tensor of shape (max_sequence_length, batch_size, output_size).\n", + " \"\"\"\n", + " hidden_outputs = []\n", + " autograph.utils.set_element_type(hidden_outputs, tf.float32)\n", + " state, output = cell.zero_state(batch_size, tf.float32)\n", + " for ch in chars:\n", + " cell_output, (state, output) = cell.call(ch, (state, output))\n", + " hidden_outputs.append(cell_output)\n", + " hidden_outputs = hidden_outputs.stack()\n", + " if training:\n", + " hidden_outputs = tf.nn.dropout(hidden_outputs, 0.5)\n", + " return hidden_outputs\n", + "\n", + " def build(self, _):\n", + " \"\"\"Creates the model variables. See keras.Model.build().\"\"\"\n", " self.lower_cell.build(tf.TensorShape((None, 256)))\n", " self.upper_cell.build(tf.TensorShape((None, 256)))\n", - " self.relu_layer.build(tf.TensorShape((None, 128)))\n", + " self.relu_layer.build(tf.TensorShape((None, 128))) \n", + " self.built = True\n", "\n", "\n", - "def rnn_layer(chars, cell, batch_size, training):\n", - " \"\"\"A simple RNN layer.\n", - " \n", - " Args:\n", - " chars: A Tensor of shape (max_sequence_length, batch_size, input_size)\n", - " cell: An object of type tf.contrib.rnn.LSTMBlockCell\n", - " batch_size: Int, the batch size to use\n", - " training: Boolean, whether the layer is used for training\n", + " def call(self, inputs, training=False):\n", + " \"\"\"The RNN model code. 
Uses Eager and \n", "\n", - " Returns:\n", - " A Tensor of shape (max_sequence_length, batch_size, output_size).\n", - " \"\"\"\n", - " hidden_outputs = []\n", - " autograph.utils.set_element_type(hidden_outputs, tf.float32)\n", - " state, output = cell.zero_state(batch_size, tf.float32)\n", - " for ch in chars:\n", - " cell_output, (state, output) = cell.call(ch, (state, output))\n", - " hidden_outputs.append(cell_output)\n", - " hidden_outputs = hidden_outputs.stack()\n", - " if training:\n", - " hidden_outputs = tf.nn.dropout(hidden_outputs, 0.5)\n", - " return hidden_outputs\n", + " The model consists of two RNN layers (made by lower_cell and upper_cell),\n", + " followed by a fully connected layer with ReLU activation.\n", "\n", + " Args:\n", + " inputs: A tuple (chars, length)\n", + " training: Boolean, whether the layer is used for training\n", "\n", - "@autograph.convert(recursive=True)\n", - "def model(inputs, colorbot, batch_size, training):\n", - " \"\"\"RNNColorbot model.\n", - " \n", - " The model consists of two RNN layers (made by lower_cell and upper_cell),\n", - " followed by a fully connected layer with ReLU activation.\n", - " \n", - " Args:\n", - " inputs: A tuple (chars, length)\n", - " colorbot: An object of type RnnColorbot\n", - " batch_size: Int, the batch size to use\n", - " training: Boolean, whether the layer is used for training\n", - " \n", - " Returns:\n", - " A Tensor of shape (batch_size, 3) - the model predictions.\n", - " \"\"\"\n", - " (chars, length) = inputs\n", - " seq = tf.transpose(chars, [1, 0, 2])\n", - " seq.set_shape((None, batch_size, 256))\n", + " Returns:\n", + " A Tensor of shape (batch_size, 3) - the model predictions.\n", + " \"\"\"\n", + " chars, length = inputs\n", + " batch_size = chars.shape[0]\n", + " seq = tf.transpose(chars, (1, 0, 2))\n", "\n", - " seq = rnn_layer(seq, colorbot.lower_cell, batch_size, training)\n", - " seq = rnn_layer(seq, colorbot.upper_cell, batch_size, training)\n", + " seq = self._rnn_layer(seq, self.lower_cell, batch_size, training)\n", + " seq = self._rnn_layer(seq, self.upper_cell, batch_size, training)\n", "\n", - " # Grab just the end-of-sequence from each output.\n", - " indices = tf.stack([length - 1, range(batch_size)], axis=1)\n", - " sequence_ends = tf.gather_nd(seq, indices)\n", - " return colorbot.relu_layer(sequence_ends)\n", + " # Grab just the end-of-sequence from each output.\n", + " indices = tf.stack([length - 1, range(batch_size)], axis=1)\n", + " sequence_ends = tf.gather_nd(seq, indices)\n", + " return self.relu_layer(sequence_ends)\n", "\n", "@autograph.convert()\n", "def loss_fn(labels, predictions):\n", @@ -246,9 +262,9 @@ } }, "source": [ - "We will now create the model function for the estimator.\n", + "We will now create the model function for the custom Estimator.\n", "\n", - "In the model function, we simply call the converted functions that we defined above - that's it!" + "In the model function, we simply use the model class we defined above - that's it!" ] }, { @@ -275,14 +291,12 @@ " sequence_length = features['sequence_length']\n", " inputs = (chars, sequence_length)\n", "\n", - " # Create the model components.\n", - " # Simply calling the AutoGraph-ed functions and objects just works!\n", + " # Create the model. 
Simply using the AutoGraph-ed class just works!\n",
     "  colorbot = RnnColorbot()\n",
-    "  \n",
-    "  batch_size = params['batch_size']\n",
+    "  colorbot.build(None)\n",
     "\n",
     "  if mode == tf.estimator.ModeKeys.TRAIN:\n",
-    "    predictions = model(inputs, colorbot, batch_size, training=True)\n",
+    "    predictions = colorbot(inputs, training=True)\n",
     "    loss = loss_fn(labels, predictions)\n",
     "\n",
     "    learning_rate = params['learning_rate']\n",
@@ -292,14 +306,13 @@
     "    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)\n",
     "\n",
     "  elif mode == tf.estimator.ModeKeys.EVAL:\n",
-    "    predictions = model(inputs, colorbot, batch_size, training=False)\n",
+    "    predictions = colorbot(inputs)\n",
     "    loss = loss_fn(labels, predictions)\n",
     "\n",
     "    return tf.estimator.EstimatorSpec(mode, loss=loss)\n",
-    "  \n",
+    "\n",
     "  elif mode == tf.estimator.ModeKeys.PREDICT:\n",
-    "    # For prediction, we expect single tensors.\n",
-    "    predictions = model(inputs, colorbot, 1, training=False)\n",
+    "    predictions = colorbot(inputs)\n",
     "\n",
     "    predictions = tf.minimum(predictions, 1.0)\n",
     "    return tf.estimator.EstimatorSpec(mode, predictions=predictions)"

[The remaining hunks of this notebook diff only refresh recorded run state: cell execution counts, Colab executionInfo timestamps and output ids, the re-recorded training output (now "Eval loss at step 100: 0.0674834"), the interactive cell's tab-bar widget HTML/JavaScript, the jQuery "Give me a color name (or press 'enter' to exit): " prompt outputs, a matplotlib figure placeholder, and two base64-encoded PNG color-swatch images. That machine-generated output churn is elided here.]
@@ -1337,7 +990,7 @@
        "def predict_input_fn(color_name):\n",
        "  \"\"\"An input function for prediction.\"\"\"\n",
        "  _, chars, sequence_length = parse(color_name)\n",
-       "  \n",
+       "\n",
        "  # We create a batch of a single element.\n",
        "  features = {\n",
        "      'chars': tf.expand_dims(chars, 0),\n",
@@ -1385,7 +1038,11 @@
   "colab": {
     "collapsed_sections": [],
     "default_view": {},
-    "name": "RNN Colorbot using Estimators",
+    "last_runtime": {
+      "build_target": "",
+      "kind": "local"
+    },
+    "name": "RNN Colorbot using Keras and Estimators",
     "provenance": [
       {
         "file_id": "1CtzefX39ffFibX_BqE6cRbT0UW_DdVKl",

From b4c37a452d2ed1d1c29ceb70127c4ef6434c44ca Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Thu, 19 Apr 2018 07:13:03 -0700
Subject: [PATCH 0415/1734] Teach the conditional simplifier about sharding.

PiperOrigin-RevId: 193510638
---
 tensorflow/compiler/xla/service/conditional_simplifier.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/conditional_simplifier.cc b/tensorflow/compiler/xla/service/conditional_simplifier.cc
index f35de080853..e560abc87f8 100644
--- a/tensorflow/compiler/xla/service/conditional_simplifier.cc
+++ b/tensorflow/compiler/xla/service/conditional_simplifier.cc
@@ -69,7 +69,7 @@ static StatusOr<bool> TryRemoveConditional(HloInstruction* conditional) {
         conditional->shape(), {conditional->mutable_operand(2)},
         conditional->false_computation()));
   }
-
+  conditional->SetupDerivedInstruction(call_op);
   TF_RETURN_IF_ERROR(computation->ReplaceInstruction(conditional, call_op));
   TF_RETURN_IF_ERROR(CallInliner::Inline(call_op).status());

From 1a2eb108a3e513a4f4609b9d421277bc222e5eb0 Mon Sep 17 00:00:00 2001
From: Yong Tang
Date: Thu, 19 Apr 2018 15:03:05 +0000
Subject: [PATCH 0416/1734] Update docs for tf.unstack with respect to numpy.

In #18692 an issue was raised over whether tf.unstack is compatible
with numpy.unstack (as the current docs claim) or with numpy.split.

There is no numpy.unstack, and numpy.split is not compatible with
tf.unstack either.

tf.split is very close to numpy.split. However, the second arg
`num_or_size_splits` of `tf.split` takes the number (or sizes) of the
splits, while the second arg `indices_or_sections` of `numpy.split`
takes the indices at which to split. For that reason tf.split is not
compatible with numpy.split either.

Accordingly, this fix simply removes the `The numpy equivalent is`
part from the docs of tf.unstack.

This fixes #18692.

Signed-off-by: Yong Tang
---
 tensorflow/python/ops/array_ops.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index ceeabe090df..23202ae28e1 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -1057,9 +1057,7 @@ def unstack(value, num=None, axis=0, name="unstack"):
     `value[:, i, :, :]` and each tensor in `output` will have shape `(A, C, D)`.
   Etc.
 
-  This is the opposite of stack.  The numpy equivalent is
-
-      tf.unstack(x, n) = np.unstack(x)
+  This is the opposite of stack.
 
   Args:
     value: A rank `R > 0` `Tensor` to be unstacked.
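A quick illustration of the incompatibility described in the patch above. This is a
minimal sketch of ours (not part of the patch), assuming the TF 1.x API of this era:

```python
import numpy as np
import tensorflow as tf

x_np = np.array([[1, 2, 3],
                 [4, 5, 6]])
x = tf.constant(x_np)

# tf.unstack removes the unstacked axis: it yields a list of two
# tensors of shape (3,).
unstacked = tf.unstack(x, axis=0)

# numpy has no np.unstack. The closest candidate, np.split, keeps the
# axis: it yields a list of two arrays of shape (1, 3), so the two
# functions are not equivalent and the docstring claim had to go.
split = np.split(x_np, 2, axis=0)
```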
From 50f6683ca50e6d4e7008d6d1b437b407d6a62e92 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Thu, 19 Apr 2018 09:13:21 -0700 Subject: [PATCH 0417/1734] Add shape check for batch related Dataset ops (#18683) * Add shape check for PrefetchDataset Signed-off-by: Yong Tang * Add BatchDataset shape check Signed-off-by: Yong Tang * Add shape check for SlideDataset Signed-off-by: Yong Tang * Add shape check for DenseToSparseBatchDataset Signed-off-by: Yong Tang * Sanitize with clang-format -i --style=Google Signed-off-by: Yong Tang --- tensorflow/core/ops/dataset_ops.cc | 31 ++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc index 34f2c612ec6..c63e485f6c8 100644 --- a/tensorflow/core/ops/dataset_ops.cc +++ b/tensorflow/core/ops/dataset_ops.cc @@ -199,7 +199,12 @@ REGISTER_OP("PrefetchDataset") .Output("handle: variant") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") - .SetShapeFn(shape_inference::ScalarShape); + .SetShapeFn([](shape_inference::InferenceContext* c) { + shape_inference::ShapeHandle unused; + // buffer_size should be a scalar. + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + return shape_inference::ScalarShape(c); + }); REGISTER_OP("ScanDataset") .Input("input_dataset: variant") @@ -283,7 +288,12 @@ REGISTER_OP("BatchDataset") .Output("handle: variant") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") - .SetShapeFn(shape_inference::ScalarShape); + .SetShapeFn([](shape_inference::InferenceContext* c) { + shape_inference::ShapeHandle unused; + // batch_size should be a scalar. + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + return shape_inference::ScalarShape(c); + }); // TODO(mrry): move SlideDataset to contrib in the future. REGISTER_OP("SlideDataset") @@ -293,7 +303,13 @@ REGISTER_OP("SlideDataset") .Output("handle: variant") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") - .SetShapeFn(shape_inference::ScalarShape); + .SetShapeFn([](shape_inference::InferenceContext* c) { + shape_inference::ShapeHandle unused; + // window_size and stride should be scalars. + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + return shape_inference::ScalarShape(c); + }); REGISTER_OP("PaddedBatchDataset") .Input("input_dataset: variant") @@ -323,7 +339,14 @@ REGISTER_OP("DenseToSparseBatchDataset") .Output("handle: variant") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") - .SetShapeFn(shape_inference::ScalarShape); + .SetShapeFn([](shape_inference::InferenceContext* c) { + shape_inference::ShapeHandle unused; + // batch_size should be a scalar. + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + // row_shape should be a 1-D vector. 
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("RangeDataset")
     .Input("start: int64")

From b71b6b8ca9ade8b39d77f0373210fe58dfccf4f4 Mon Sep 17 00:00:00 2001
From: Yong Tang
Date: Thu, 19 Apr 2018 09:13:35 -0700
Subject: [PATCH 0418/1734] Shape validation with random/shuffle related
 Dataset ops (#18682)

* Add shape check for CacheDataset

Signed-off-by: Yong Tang

* Add shape check for ShuffleAndRepeatDataset

Signed-off-by: Yong Tang

* Add check for ShuffleDataset

Signed-off-by: Yong Tang

* Add shape check for RandomDataset

Signed-off-by: Yong Tang

* Add RangeDataset shape check

Signed-off-by: Yong Tang

* Sanitize with clang-format -i --style=Google

Signed-off-by: Yong Tang
---
 tensorflow/core/ops/dataset_ops.cc | 43 ++++++++++++++++++++++++++----
 1 file changed, 38 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index c63e485f6c8..dae0c0eae45 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -357,7 +357,14 @@ REGISTER_OP("RangeDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // start, stop, and step should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("RandomDataset")
     .Input("seed: int64")
@@ -367,7 +374,13 @@ REGISTER_OP("RandomDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // seed and seed2 should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("ShuffleDataset")
     .Input("input_dataset: variant")
@@ -378,7 +391,14 @@ REGISTER_OP("ShuffleDataset")
     .Attr("reshuffle_each_iteration: bool = true")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // buffer_size, seed, and seed2 should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("ShuffleAndRepeatDataset")
     .Input("input_dataset: variant")
@@ -389,7 +409,15 @@ REGISTER_OP("ShuffleAndRepeatDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // buffer_size, seed, seed2, and count should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("CacheDataset")
     .Input("input_dataset: variant")
@@ -397,7 +425,12 @@ REGISTER_OP("CacheDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // filename should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("TextLineDataset")
     .Input("filenames: string")
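To make these new scalar-rank checks concrete: with the shape functions above in
place, a malformed argument is rejected by shape inference while the graph is being
built, instead of failing inside the kernel at runtime. A hedged Python sketch
(ours, not part of the patches; the exact error type and message are assumptions):

```python
import tensorflow as tf

# Scalar arguments pass the WithRank(..., 0, ...) checks as before.
ds = tf.data.Dataset.range(10).shuffle(buffer_size=5)

# A vector buffer_size is now caught by shape inference when the
# PrefetchDataset op is instantiated, with a ValueError along the lines
# of "Shape must be rank 0 but is rank 1", rather than by the kernel.
bad = tf.data.Dataset.range(10).prefetch(buffer_size=tf.constant([1, 2]))
iterator = bad.make_one_shot_iterator()  # the op is built, and fails, here
```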
From 76619c8dea0e480fd48e3b4dcfe0249eb24216b8 Mon Sep 17 00:00:00 2001
From: Yong Tang
Date: Thu, 19 Apr 2018 09:13:53 -0700
Subject: [PATCH 0419/1734] Validation in shape functions of Dataset ops
 (#18680)

* Add shape check for PrependFromQueueAndPaddedBatchDataset

Signed-off-by: Yong Tang

* Add comment for shape check

Signed-off-by: Yong Tang

* Add shape check for FixedLengthRecordDataset

Signed-off-by: Yong Tang

* Add check for filenames as well

Signed-off-by: Yong Tang

* Clang-format -i --style=google for file format

Signed-off-by: Yong Tang

* Add shape check for SqlDataset

Signed-off-by: Yong Tang
---
 tensorflow/core/ops/dataset_ops.cc | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index dae0c0eae45..869bef80409 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -459,7 +459,14 @@ REGISTER_OP("SqlDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // driver_name, data_source_name, and query should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("FixedLengthRecordDataset")
     .Input("filenames: string")
@@ -470,7 +477,18 @@ REGISTER_OP("FixedLengthRecordDataset")
     .Output("handle: variant")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // `filenames` must be a scalar or a vector.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused));
+      // header_bytes, record_bytes, footer_bytes, buffer_size should be
+      // scalars.
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); + return shape_inference::ScalarShape(c); + }); REGISTER_OP("TFRecordDataset") .Input("filenames: string") @@ -609,7 +627,12 @@ REGISTER_OP("PrependFromQueueAndPaddedBatchDataset") // length of `output_types` is `N`, the `output_shapes` are // (as far as possible to tell statically) compatible with `padded_shapes`, // and that `padding_values` are all scalars. - .SetShapeFn(shape_inference::ScalarShape); + .SetShapeFn([](shape_inference::InferenceContext* c) { + shape_inference::ShapeHandle unused; + // batch_size should be a scalar. + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + return shape_inference::ScalarShape(c); + }); REGISTER_OP("EnqueueInQueueDataset") .Input("queue: variant") From 7e735e5be811bacfa4e16aeae2e8aa53ef209ea6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 19 Apr 2018 09:13:47 -0700 Subject: [PATCH 0420/1734] Pin pip to version 9.0.3 * This is because pip 10 is still unstable in some distros * reference: https://github.com/pypa/pip/issues/5240 PiperOrigin-RevId: 193525542 --- tensorflow/tools/ci_build/install/install_pip_packages.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh index fc137aeeedf..9644277fabf 100755 --- a/tensorflow/tools/ci_build/install/install_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh @@ -19,11 +19,11 @@ set -e # We don't apt-get install so that we can install a newer version of pip. # Only needed for Ubuntu 14.04 ,and not needed for Ubuntu 16.04 / Debian 8,9 if $(cat /etc/*-release | grep -q 14.04); then - easy_install -U pip - easy_install3 -U pip + easy_install -U pip==9.0.3 + easy_install3 -U pip==9.0.3 else - pip2 install --upgrade pip - pip3 install --upgrade pip + pip2 install --upgrade pip==9.0.3 + pip3 install --upgrade pip==9.0.3 fi # Install pip packages from whl files to avoid the time-consuming process of From 51a26bb2f3e66fc79a5870f6eed88f60de995d4a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 19 Apr 2018 09:23:35 -0700 Subject: [PATCH 0421/1734] [TF:XLA] Change HloTestBase::ExecuteNoHloPasses to return a literal directly. 
PiperOrigin-RevId: 193526900
---
 tensorflow/compiler/xla/tests/hlo_test_base.cc | 8 +++++---
 tensorflow/compiler/xla/tests/hlo_test_base.h  | 2 +-
 tensorflow/compiler/xla/tests/tuple_test.cc    | 3 +--
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index c5afe0c3e05..9984aba089b 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -113,11 +113,13 @@ StatusOr<std::unique_ptr<Literal>> HloTestBase::Execute(
   return test_runner_.Execute(std::move(module), arguments);
 }
 
-StatusOr<std::unique_ptr<Literal>> HloTestBase::ExecuteNoHloPasses(
+std::unique_ptr<Literal> HloTestBase::ExecuteNoHloPasses(
     std::unique_ptr<HloModule> module,
     tensorflow::gtl::ArraySlice<Literal*> arguments) {
-  return test_runner_.Execute(std::move(module), arguments,
-                              /*run_hlo_passes=*/false);
+  return test_runner_
+      .Execute(std::move(module), arguments,
+               /*run_hlo_passes=*/false)
+      .ValueOrDie();
 }
 
 std::unique_ptr<Literal> HloTestBase::ExecuteAndTransfer(
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 28d7ab09cb6..79fcea9403e 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -99,7 +99,7 @@ class HloTestBase : public ::testing::Test {
 
   // Same as above, except the module will be executed without running any HLO
   // passes on it.
-  StatusOr<std::unique_ptr<Literal>> ExecuteNoHloPasses(
+  std::unique_ptr<Literal> ExecuteNoHloPasses(
       std::unique_ptr<HloModule> module,
       tensorflow::gtl::ArraySlice<Literal*> arguments);
 
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index 098be6d7aab..61d0fa02aba 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -535,8 +535,7 @@ TEST_F(TupleHloTest,
       HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
          .ValueOrDie();
   auto param = Literal::MakeTupleOwned(Literal::CreateR1<int32>({1, 2, 3}));
-  TF_ASSERT_OK_AND_ASSIGN(auto result,
-                          ExecuteNoHloPasses(std::move(module), {param.get()}));
+  auto result = ExecuteNoHloPasses(std::move(module), {param.get()});
   EXPECT_TRUE(LiteralTestUtil::Equal(
       *result,
       *Literal::MakeTupleOwned(Literal::CreateR2<int32>({{1, 2, 3}}))));

From 0b3950d67bcb07c11f87bd3c2da554017bff0674 Mon Sep 17 00:00:00 2001
From: imsheridan
Date: Fri, 20 Apr 2018 00:35:54 +0800
Subject: [PATCH 0422/1734] Fix code block rendering in several api definitions

---
 tensorflow/core/api_def/base_api/api_def_Pad.pbtxt        | 1 +
 tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/tensorflow/core/api_def/base_api/api_def_Pad.pbtxt b/tensorflow/core/api_def/base_api/api_def_Pad.pbtxt
index e45e2375eb9..ee4aad78993 100644
--- a/tensorflow/core/api_def/base_api/api_def_Pad.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Pad.pbtxt
@@ -24,5 +24,6 @@ pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
                       [0, 0, 2, 2, 0, 0]
                       [0, 0, 0, 0, 0, 0]]
 ```
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt
index b9e75caf02b..37ac10dddb7 100644
--- a/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt
@@ -44,6 +44,7 @@ In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
 if T == qint8, out[i] -= (range(T) + 1) / 2.0
 ```
+
 here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
 
 *MIN_COMBINED Mode Example*
@@ -87,6 +88,7 @@ choosing to elide the lowest possible value for symmetry (e.g., output range is
 
 We first find the range of values in our tensor. The
 range we use is always centered on 0, so we find m such that
+
 ```c++
   m = max(abs(input_min), abs(input_max))
 ```
@@ -95,6 +97,7 @@ Our input tensor range is then `[-m, m]`.
 
 Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
 If T is signed, this is
+
 ```
   num_bits = sizeof(T) * 8
   [min_fixed, max_fixed] =
@@ -102,16 +105,19 @@ If T is signed, this is
 ```
 
 Otherwise, if T is unsigned, the fixed-point range is
+
 ```
   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
 ```
 
 From this we compute our scaling factor, s:
+
 ```c++
   s = (max_fixed - min_fixed) / (2 * m)
 ```
 
 Now we can quantize the elements of our tensor:
+
 ```c++
   result = round(input * s)
 ```
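For readers of the QuantizeV2 doc change above, the SCALED-mode formulas are easy to
check numerically. A minimal numpy sketch (ours, not part of the patch; the function
name is hypothetical) for T = qint8:

```python
import numpy as np

def quantize_scaled_qint8(values, input_min, input_max):
    # The range is centered on 0: m = max(|input_min|, |input_max|).
    m = max(abs(input_min), abs(input_max))
    # Signed 8-bit fixed-point range, eliding the lowest value (-128)
    # for symmetry, as the doc describes.
    num_bits = 8
    min_fixed = -(1 << (num_bits - 1)) + 1  # -127
    max_fixed = (1 << (num_bits - 1)) - 1   #  127
    # s = (max_fixed - min_fixed) / (2 * m)
    s = (max_fixed - min_fixed) / (2.0 * m)
    # result = round(input * s)
    return np.round(np.asarray(values) * s).astype(np.int8)

print(quantize_scaled_qint8([-0.5, 0.0, 1.0], input_min=-1.0, input_max=1.0))
# -> [-64   0 127]   (m = 1.0, so s = 127)
```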
From 1f1d7b88717847f590987ee40efbe970bb591275 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Thu, 19 Apr 2018 09:34:24 -0700
Subject: [PATCH 0423/1734] Disable dlopen error of libneuralnetworks for
 non-Android platforms.

PiperOrigin-RevId: 193528346
---
 tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
index 85aca368740..ace4827d8ce 100644
--- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
@@ -34,10 +34,13 @@ limitations under the License.
 inline void* loadLibrary(const char* name) {
   // TODO: change RTLD_LOCAL? Assumes there can be multiple instances of nn
   // api RT
-  void* handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
+  void* handle = nullptr;
+#ifdef __ANDROID__
+  handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
   if (handle == nullptr) {
     NNAPI_LOG("nnapi error: unable to open library %s", name);
   }
+#endif
   return handle;
 }

From c173157bdc132460c6f424a9803221e74fc73f59 Mon Sep 17 00:00:00 2001
From: Saurabh Saxena
Date: Thu, 19 Apr 2018 09:37:20 -0700
Subject: [PATCH 0424/1734] [tf.data] Add checkpointing support for MapAndBatchDataset.
PiperOrigin-RevId: 193528712 --- .../kernel_tests/batch_dataset_op_test.py | 31 ++ .../kernels/data/map_and_batch_dataset_op.cc | 277 +++++++++++++++++- 2 files changed, 302 insertions(+), 6 deletions(-) diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py index e1ec60d7c9f..a4a0ce79b60 100644 --- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py @@ -681,6 +681,37 @@ class UnbatchDatasetSerializationTest( num_outputs) +class MapAndBatchDatasetSerializationTest( + dataset_serialization_test_base.DatasetSerializationTestBase): + + def testSerializationCore(self): + range_size = 11 + num_repeats = 2 + batch_size = 5 + total_outputs = range_size * num_repeats + num_outputs_drop_remainder = total_outputs // batch_size + num_outputs_keep_remainder = int(math.ceil(total_outputs / batch_size)) + num_parallel_batches = 2 + + def build_ds(range_start, drop_remainder=False): + + def _map_fn(x): + return math_ops.square(x) + + return dataset_ops.Dataset.range( + range_start, range_start + range_size).repeat(num_repeats).apply( + batching.map_and_batch( + map_func=_map_fn, + batch_size=batch_size, + num_parallel_batches=num_parallel_batches, + drop_remainder=drop_remainder)) + + self.run_core_tests(lambda: build_ds(10), lambda: build_ds(15), + num_outputs_keep_remainder) + self.run_core_tests(lambda: build_ds(10, True), lambda: build_ds(15, True), + num_outputs_drop_remainder) + + class PaddedBatchDatasetSerializationTest( dataset_serialization_test_base.DatasetSerializationTestBase): diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc index aaf4dc73418..b8105552a0e 100644 --- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc @@ -74,26 +74,29 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { OP_REQUIRES_OK(ctx, CapturedFunction::Create( func_, std::move(other_arguments), &captured_func)); - *output = new Dataset(input, batch_size, num_parallel_batches, - drop_remainder, output_types_, output_shapes_, + *output = new Dataset(ctx, input, batch_size, num_parallel_batches, + drop_remainder, output_types_, output_shapes_, func_, std::move(captured_func), &ctx->eigen_cpu_device()); } private: - class Dataset : public DatasetBase { + class Dataset : public GraphDatasetBase { public: - Dataset(const DatasetBase* input, int64 batch_size, + Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 batch_size, int64 num_parallel_batches, bool drop_remainder, const DataTypeVector& output_types, const std::vector& output_shapes, + const NameAttrList& func, std::unique_ptr captured_func, const Eigen::ThreadPoolDevice* device) - : input_(input), + : GraphDatasetBase(ctx), + input_(input), batch_size_(batch_size), num_parallel_batches_(num_parallel_batches), drop_remainder_(drop_remainder), output_types_(output_types), output_shapes_(output_shapes), + map_fn_(func), captured_func_(std::move(captured_func)), device_(device) { input_->Ref(); @@ -117,6 +120,48 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { string DebugString() override { return "MapAndBatchDatasetOp::Dataset"; } + protected: + Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b, + Node** output) const override { + TF_RETURN_IF_ERROR(b->AddFunction(ctx, 
map_fn_.name())); + Node* input_graph_node = nullptr; + TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node)); + Node* batch_size_node; + TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size_node)); + Node* num_parallel_batches_node; + TF_RETURN_IF_ERROR( + b->AddScalar(num_parallel_batches_, &num_parallel_batches_node)); + Node* drop_remainder_node; + TF_RETURN_IF_ERROR(b->AddScalar(drop_remainder_, &drop_remainder_node)); + + DataTypeVector other_arguments_types; + other_arguments_types.reserve(captured_func_->captured_inputs().size()); + std::vector other_arguments; + other_arguments.reserve(captured_func_->captured_inputs().size()); + for (const Tensor& t : captured_func_->captured_inputs()) { + Node* node; + TF_RETURN_IF_ERROR(b->AddTensor(t, &node)); + other_arguments.emplace_back(node); + other_arguments_types.emplace_back(t.dtype()); + } + AttrValue f; + b->BuildAttrValue(map_fn_, &f); + AttrValue other_arguments_types_attr; + b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr); + + TF_RETURN_IF_ERROR(b->AddDataset( + this, + {std::make_pair(0, input_graph_node), + std::make_pair(2, batch_size_node), + std::make_pair(3, num_parallel_batches_node), + std::make_pair(4, drop_remainder_node)}, // Single tensor inputs. + {std::make_pair(1, other_arguments)}, // Tensor list inputs. + {std::make_pair("f", f), + std::make_pair("Targuments", other_arguments_types_attr)}, // Attrs + output)); + return Status::OK(); + } + private: class Iterator : public DatasetIterator { public: @@ -217,9 +262,83 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { return status; } + protected: + Status SaveInternal(IteratorStateWriter* writer) override { + mutex_lock l(mu_); + if (current_batch_index_ == -1) { + // Iterator has not been used. Nothing to save. + return Status::OK(); + } + TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("current_batch_index"), + current_batch_index_)); + TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_)); + TF_RETURN_IF_ERROR(writer->WriteScalar( + full_name("invocation_results_size"), invocation_results_.size())); + for (size_t i = 0; i < invocation_results_.size(); ++i) { + TF_RETURN_IF_ERROR(WriteInvocationResultLocked(writer, i)); + } + TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("batch_results_size"), + batch_results_.size())); + for (size_t i = 0; i < batch_results_.size(); ++i) { + TF_RETURN_IF_ERROR(WriteBatchResultLocked(writer, i)); + } + return Status::OK(); + } + + Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) override { + mutex_lock l(mu_); + if (!reader->Contains(full_name("current_batch_index"))) { + // Iterator was never used so nothing to restore. 
+ return Status::OK(); + } + { + int64 temp; + TF_RETURN_IF_ERROR( + reader->ReadScalar(full_name("current_batch_index"), &temp)); + current_batch_index_ = static_cast(temp); + if (current_batch_index_ != temp) { + return errors::Internal("Invalid value for current_batch_index ", + temp); + } + } + TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_)); + size_t invocation_results_size; + { + int64 temp; + TF_RETURN_IF_ERROR( + reader->ReadScalar(full_name("invocation_results_size"), &temp)); + invocation_results_size = static_cast(temp); + if (invocation_results_size != temp) { + return errors::Internal( + "Invalid value for invocation_results_size ", temp); + } + } + CHECK_EQ(invocation_results_.size(), invocation_results_size); + for (size_t i = 0; i < invocation_results_size; ++i) { + TF_RETURN_IF_ERROR(ReadInvocationResultLocked(reader, i)); + } + size_t batch_results_size; + { + int64 temp; + TF_RETURN_IF_ERROR( + reader->ReadScalar(full_name("batch_results_size"), &temp)); + batch_results_size = static_cast(temp); + if (batch_results_size != temp) { + return errors::Internal("Invalid value for batch_results_size ", + temp); + } + } + CHECK_EQ(batch_results_.size(), batch_results_size); + for (size_t i = 0; i < batch_results_size; ++i) { + TF_RETURN_IF_ERROR(ReadBatchResultLocked(reader, i)); + } + return Status::OK(); + } + private: struct BatchResult { - mutex mu; + mutex mu ACQUIRED_AFTER(mu_); bool output_allocated GUARDED_BY(mu); std::vector output; std::unique_ptr counter; @@ -393,6 +512,151 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { return status; } + Status WriteInvocationResultLocked(IteratorStateWriter* writer, + size_t index) + EXCLUSIVE_LOCKS_REQUIRED(mu_) { + const InvocationResult& result = invocation_results_[index]; + string prefix = strings::StrCat("invocation_results_", index); + TF_RETURN_IF_ERROR(WriteStatusLocked( + writer, full_name(strings::StrCat(prefix, "_status")), + result.status)); + if (result.end_of_input) { + TF_RETURN_IF_ERROR(writer->WriteScalar( + full_name(strings::StrCat(prefix, "_end_of_input")), "")); + } + TF_RETURN_IF_ERROR(writer->WriteScalar( + full_name(strings::StrCat(prefix, "_return_values_size")), + result.return_values.size())); + for (size_t i = 0; i < result.return_values.size(); i++) { + TF_RETURN_IF_ERROR(writer->WriteTensor( + full_name(strings::StrCat(prefix, "_return_values_", i)), + result.return_values[i])); + } + return Status::OK(); + } + + Status ReadInvocationResultLocked(IteratorStateReader* reader, + size_t index) + EXCLUSIVE_LOCKS_REQUIRED(mu_) { + InvocationResult* result = &invocation_results_[index]; + string prefix = strings::StrCat("invocation_results_", index); + TF_RETURN_IF_ERROR(ReadStatusLocked( + reader, full_name(strings::StrCat(prefix, "_status")), + &result->status)); + result->end_of_input = reader->Contains( + full_name(strings::StrCat(prefix, "_end_of_input"))); + size_t return_values_size; + { + int64 temp; + TF_RETURN_IF_ERROR(reader->ReadScalar( + full_name(strings::StrCat(prefix, "_return_values_size")), + &temp)); + return_values_size = static_cast(temp); + if (temp != return_values_size) { + return errors::Internal("Invalid value for return_values_size ", + return_values_size); + } + } + result->return_values.reserve(return_values_size); + for (size_t i = 0; i < return_values_size; i++) { + result->return_values.emplace_back(); + TF_RETURN_IF_ERROR(reader->ReadTensor( + full_name(strings::StrCat(prefix, "_return_values_", i)), + &result->return_values.back())); + } + return 
Status::OK(); + } + + Status WriteBatchResultLocked(IteratorStateWriter* writer, size_t index) + EXCLUSIVE_LOCKS_REQUIRED(mu_) { + // Wait for the map_fn dispatches made in `InvokeFunctionLocked` to + // finish. This may delay saving a checkpoint by a bit but keeps the + // code clean and also saves us from checkpointing the state of the + // `BlockingCounter`. + batch_results_[index].counter->Wait(); + const BatchResult& result = batch_results_[index]; + string prefix = strings::StrCat("batch_results_", index); + { + mutex_lock l(batch_results_[index].mu); + if (result.output_allocated) { + TF_RETURN_IF_ERROR(writer->WriteScalar( + full_name(strings::StrCat(prefix, "_output_allocated")), "")); + } + } + TF_RETURN_IF_ERROR(writer->WriteScalar( + full_name(strings::StrCat(prefix, "_output_size")), + result.output.size())); + for (size_t i = 0; i < result.output.size(); i++) { + TF_RETURN_IF_ERROR(writer->WriteTensor( + full_name(strings::StrCat(prefix, "_output_", i)), + result.output[i])); + } + return Status::OK(); + } + + Status ReadBatchResultLocked(IteratorStateReader* reader, size_t index) + EXCLUSIVE_LOCKS_REQUIRED(mu_) { + BatchResult* result = &batch_results_[index]; + string prefix = strings::StrCat("batch_results_", index); + { + mutex_lock l(batch_results_[index].mu); + result->output_allocated = reader->Contains( + full_name(strings::StrCat(prefix, "_output_allocated"))); + // Simulate that the batch was fully generated. + batch_results_[index].counter.reset(new BlockingCounter(0)); + } + size_t output_size; + { + int64 temp; + TF_RETURN_IF_ERROR(reader->ReadScalar( + full_name(strings::StrCat(prefix, "_output_size")), &temp)); + output_size = static_cast(temp); + if (temp != output_size) { + return errors::Internal("Invalid value for output_size ", + output_size); + } + } + result->output.reserve(output_size); + for (size_t i = 0; i < output_size; i++) { + result->output.emplace_back(); + TF_RETURN_IF_ERROR(reader->ReadTensor( + full_name(strings::StrCat(prefix, "_output_", i)), + &result->output.back())); + } + return Status::OK(); + } + + Status WriteStatusLocked(IteratorStateWriter* writer, + const string& prefix, const Status& status) + EXCLUSIVE_LOCKS_REQUIRED(mu_) { + TF_RETURN_IF_ERROR( + writer->WriteScalar(full_name(strings::StrCat(prefix, "_code")), + static_cast(status.code()))); + if (!status.ok()) { + TF_RETURN_IF_ERROR( + writer->WriteScalar(full_name(strings::StrCat(prefix, "_msg")), + status.error_message())); + } + return Status::OK(); + } + + Status ReadStatusLocked(IteratorStateReader* reader, const string& prefix, + Status* status) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + int64 code_int; + TF_RETURN_IF_ERROR(reader->ReadScalar( + full_name(strings::StrCat(prefix, "_code")), &code_int)); + error::Code code = static_cast(code_int); + + if (code != error::Code::OK) { + string error_message; + TF_RETURN_IF_ERROR(reader->ReadScalar( + full_name(strings::StrCat(prefix, "_msg")), &error_message)); + *status = Status(code, error_message); + } else { + *status = Status::OK(); + } + return Status::OK(); + } mutex mu_; int32 current_batch_index_ GUARDED_BY(mu_) = -1; const std::unique_ptr input_impl_ GUARDED_BY(mu_); @@ -407,6 +671,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { const bool drop_remainder_; const DataTypeVector output_types_; const std::vector output_shapes_; + const NameAttrList map_fn_; const std::unique_ptr captured_func_; const Eigen::ThreadPoolDevice* device_; // not owned }; From 436f1434060d7f370baae9661baacc6cf27415ec Mon Sep 17 
00:00:00 2001 From: Allen Lavoie Date: Thu, 19 Apr 2018 09:54:40 -0700 Subject: [PATCH 0425/1734] Create a skeleton tf.contrib.checkpoint. My plan for this is to incubate tools for working with object-based checkpoints: - Tools for managing dependency graphs, e.g. checkpointable lists/dictionaries - Inspecting/visualizing checkpoints - Listing variables and gathering initializers from a Checkpointable object and its dependencies - Verifying all variables are accessible as dependencies, which should make converting existing graph building Saver uses easier/safer. This CL includes none of those things, it just moves the split_dependency tool here instead of contrib/eager. PiperOrigin-RevId: 193531292 --- tensorflow/contrib/__init__.py | 1 + tensorflow/contrib/checkpoint/README.md | 2 + tensorflow/contrib/checkpoint/__init__.py | 29 +++++++++++ tensorflow/contrib/checkpoint/python/BUILD | 29 +++++++++++ .../python/split_dependency.py} | 8 ++-- .../python/split_dependency_test.py} | 4 +- tensorflow/contrib/cmake/python_modules.txt | 2 + tensorflow/contrib/cudnn_rnn/BUILD | 2 +- .../cudnn_rnn/python/ops/cudnn_rnn_ops.py | 4 +- tensorflow/contrib/eager/python/BUILD | 48 ++----------------- tensorflow/contrib/optimizer_v2/BUILD | 1 - tensorflow/tools/pip_package/BUILD | 1 - 12 files changed, 75 insertions(+), 56 deletions(-) create mode 100644 tensorflow/contrib/checkpoint/README.md create mode 100644 tensorflow/contrib/checkpoint/__init__.py create mode 100644 tensorflow/contrib/checkpoint/python/BUILD rename tensorflow/contrib/{eager/python/checkpointable_utils.py => checkpoint/python/split_dependency.py} (95%) rename tensorflow/contrib/{eager/python/checkpointable_utils_test.py => checkpoint/python/split_dependency_test.py} (96%) diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py index 36cc5144d07..0d163daa6e2 100644 --- a/tensorflow/contrib/__init__.py +++ b/tensorflow/contrib/__init__.py @@ -24,6 +24,7 @@ import os # Add projects here, they will show up under tf.contrib. from tensorflow.contrib import batching from tensorflow.contrib import bayesflow +from tensorflow.contrib import checkpoint from tensorflow.contrib import cloud from tensorflow.contrib import cluster_resolver from tensorflow.contrib import coder diff --git a/tensorflow/contrib/checkpoint/README.md b/tensorflow/contrib/checkpoint/README.md new file mode 100644 index 00000000000..d35c5bae3b7 --- /dev/null +++ b/tensorflow/contrib/checkpoint/README.md @@ -0,0 +1,2 @@ +Tools for working with object-based checkpoints produced by +`tf.train.Checkpoint`. diff --git a/tensorflow/contrib/checkpoint/__init__.py b/tensorflow/contrib/checkpoint/__init__.py new file mode 100644 index 00000000000..70d7d2d8d79 --- /dev/null +++ b/tensorflow/contrib/checkpoint/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tools for working with object-based checkpoints. + + +For creating and managing dependencies: +@@split_dependency +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.checkpoint.python.split_dependency import split_dependency +from tensorflow.python.util.all_util import remove_undocumented + +remove_undocumented(module_name=__name__) diff --git a/tensorflow/contrib/checkpoint/python/BUILD b/tensorflow/contrib/checkpoint/python/BUILD new file mode 100644 index 00000000000..d57b01aab26 --- /dev/null +++ b/tensorflow/contrib/checkpoint/python/BUILD @@ -0,0 +1,29 @@ +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//tensorflow:internal"]) + +load("//tensorflow:tensorflow.bzl", "py_test") + +py_library( + name = "split_dependency", + srcs = ["split_dependency.py"], + srcs_version = "PY2AND3", + visibility = ["//tensorflow:internal"], + deps = [ + "//tensorflow/python:control_flow_ops", + "//tensorflow/python:training", + ], +) + +py_test( + name = "split_dependency_test", + srcs = ["split_dependency_test.py"], + deps = [ + ":split_dependency", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:resource_variable_ops", + "//tensorflow/python:training", + "//tensorflow/python/eager:test", + ], +) diff --git a/tensorflow/contrib/eager/python/checkpointable_utils.py b/tensorflow/contrib/checkpoint/python/split_dependency.py similarity index 95% rename from tensorflow/contrib/eager/python/checkpointable_utils.py rename to tensorflow/contrib/checkpoint/python/split_dependency.py index 30c4103c5aa..3aec8c96e90 100644 --- a/tensorflow/contrib/eager/python/checkpointable_utils.py +++ b/tensorflow/contrib/checkpoint/python/split_dependency.py @@ -1,4 +1,4 @@ -"""Utilities for working with Checkpointable objects.""" +"""Utility for creating multiple dependencies with synchronized save/restore.""" # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -20,7 +20,7 @@ from __future__ import print_function import functools from tensorflow.python.ops import control_flow_ops -from tensorflow.python.training import checkpointable as core_checkpointable +from tensorflow.python.training import checkpointable as checkpointable from tensorflow.python.training import saver as saver_lib @@ -43,7 +43,7 @@ class _CallbackSaveable(saver_lib.BaseSaverBuilder.SaveableObject): return self._restore_callback(tensor) -class _SplitDependency(core_checkpointable.CheckpointableBase): +class _SplitDependency(checkpointable.CheckpointableBase): """Looks like a regular variable while synchronizing save/restores.""" def __init__(self, save_buffer, restore_buffer, name, dtype, num_components, @@ -83,7 +83,7 @@ class _SplitDependency(core_checkpointable.CheckpointableBase): def _gather_saveables_for_checkpoint(self): """Looks to Checkpointable like a regular variable.""" return { - core_checkpointable.VARIABLE_VALUE_KEY: + checkpointable.VARIABLE_VALUE_KEY: functools.partial(_CallbackSaveable, dtype=self._dtype, save_callback=self._save, diff --git a/tensorflow/contrib/eager/python/checkpointable_utils_test.py b/tensorflow/contrib/checkpoint/python/split_dependency_test.py similarity index 96% rename from tensorflow/contrib/eager/python/checkpointable_utils_test.py rename to tensorflow/contrib/checkpoint/python/split_dependency_test.py index da04199aaad..cb964c80e94 100644 --- a/tensorflow/contrib/eager/python/checkpointable_utils_test.py +++ b/tensorflow/contrib/checkpoint/python/split_dependency_test.py @@ -18,7 +18,7 @@ from __future__ import print_function import os -from tensorflow.contrib.eager.python import checkpointable_utils as contrib_checkpointable_utils +from tensorflow.contrib.checkpoint.python import split_dependency from tensorflow.python.eager import test from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops @@ -47,7 +47,7 @@ class SaveTensorSlicesAsDeps(checkpointable.CheckpointableBase): def __init__(self): self.combined = resource_variable_ops.ResourceVariable([0., 0., 0., 0.]) - split_dependencies = contrib_checkpointable_utils.split_dependency( + split_dependencies = split_dependency.split_dependency( component_names=("first_half", "second_half"), component_dtypes=(self.combined.dtype,) * 2, fill_save_buffer_fn=_split_variable_closure( diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt index 91839194c7c..fbcdf7e753d 100644 --- a/tensorflow/contrib/cmake/python_modules.txt +++ b/tensorflow/contrib/cmake/python_modules.txt @@ -130,6 +130,8 @@ tensorflow/contrib/boosted_trees/ops tensorflow/contrib/boosted_trees/proto tensorflow/contrib/boosted_trees/python tensorflow/contrib/boosted_trees/python/ops +tensorflow/contrib/checkpoint +tensorflow/contrib/checkpoint/python tensorflow/contrib/cloud tensorflow/contrib/cloud/kernels tensorflow/contrib/cloud/ops diff --git a/tensorflow/contrib/cudnn_rnn/BUILD b/tensorflow/contrib/cudnn_rnn/BUILD index d68015ae156..aeefa3cee62 100644 --- a/tensorflow/contrib/cudnn_rnn/BUILD +++ b/tensorflow/contrib/cudnn_rnn/BUILD @@ -25,7 +25,7 @@ tf_custom_op_py_library( srcs_version = "PY2AND3", visibility = ["//visibility:public"], deps = [ - "//tensorflow/contrib/eager/python:checkpointable_utils", + "//tensorflow/contrib/checkpoint/python:split_dependency", "//tensorflow/contrib/util:util_py", "//tensorflow/python:array_ops", 
"//tensorflow/python:control_flow_ops", diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py index b615824460b..a1ede4471ef 100644 --- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py +++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py @@ -17,7 +17,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.eager.python import checkpointable_utils +from tensorflow.contrib.checkpoint.python import split_dependency from tensorflow.contrib.rnn.python.ops import lstm_ops from tensorflow.python.framework import common_shapes from tensorflow.python.framework import dtypes @@ -318,7 +318,7 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject): dependencies too (typically the cuDNN `Layer`). dtype: The dtype for the canonical parameter Tensors. """ - split_dependencies = checkpointable_utils.split_dependency( + split_dependencies = split_dependency.split_dependency( component_names=self._param_names, component_dtypes=(dtype,) * len(self._param_names), fill_save_buffer_fn=self._checkpointable_save, diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD index e2744a430d1..99abbae03fc 100644 --- a/tensorflow/contrib/eager/python/BUILD +++ b/tensorflow/contrib/eager/python/BUILD @@ -11,7 +11,6 @@ py_library( srcs_version = "PY2AND3", visibility = ["//visibility:public"], deps = [ - ":checkpointable_utils", ":datasets", ":metrics", ":network", @@ -19,15 +18,14 @@ py_library( "//tensorflow/python:framework_ops", "//tensorflow/python:framework_test_lib", "//tensorflow/python:gradients", - "//tensorflow/python:numerics", "//tensorflow/python:resource_variable_ops", "//tensorflow/python:script_ops", "//tensorflow/python:template", + "//tensorflow/python:training", "//tensorflow/python:util", "//tensorflow/python:variable_scope", "//tensorflow/python/eager:backprop", "//tensorflow/python/eager:context", - "//tensorflow/python/eager:core", "//tensorflow/python/eager:execution_callbacks", "//tensorflow/python/eager:function", ], @@ -70,7 +68,6 @@ cuda_py_test( srcs = ["datasets_test.py"], additional_deps = [ ":datasets", - ":checkpointable_utils", "//tensorflow/contrib/data/python/ops:prefetching_ops", "//tensorflow/contrib/data/python/ops:threadpool", "//tensorflow/contrib/data/python/ops:unique", @@ -79,6 +76,7 @@ cuda_py_test( "//tensorflow/python:framework_ops", "//tensorflow/python:math_ops", "//tensorflow/python:script_ops", + "//tensorflow/python:training", "//tensorflow/python/data", "//tensorflow/python/eager:test", ], @@ -121,8 +119,8 @@ py_library( srcs_version = "PY2AND3", visibility = ["//tensorflow:internal"], deps = [ - "//tensorflow/contrib/eager/python:checkpointable_utils", "//tensorflow/python:array_ops", + "//tensorflow/python:checkpointable", "//tensorflow/python:control_flow_ops", "//tensorflow/python:dtypes", "//tensorflow/python:framework_ops", @@ -225,43 +223,3 @@ py_test( "//tensorflow/python/eager:test", ], ) - -py_library( - name = "checkpointable_utils", - srcs = ["checkpointable_utils.py"], - srcs_version = "PY2AND3", - visibility = ["//tensorflow:internal"], - deps = [ - "//tensorflow/python:control_flow_ops", - "//tensorflow/python:training", - ], -) - -cuda_py_test( - name = "checkpointable_utils_test", - srcs = ["checkpointable_utils_test.py"], - additional_deps = [ - ":checkpointable_utils", - ":network", - "@six_archive//:six", - 
"//tensorflow/python:constant_op", - "//tensorflow/python:dtypes", - "//tensorflow/python:framework_ops", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:init_ops", - "//tensorflow/python:layers", - "//tensorflow/python:layers_base", - "//tensorflow/python:resource_variable_ops", - "//tensorflow/python:state_ops", - "//tensorflow/python:training", - "//tensorflow/python:variable_scope", - "//tensorflow/python:variables", - "//tensorflow/python/eager:context", - "//tensorflow/python/eager:test", - "//tensorflow/python/keras", - ], - tags = [ - "no_windows", # TODO: needs investigation on Windows - "notsan", # b/74395663 - ], -) diff --git a/tensorflow/contrib/optimizer_v2/BUILD b/tensorflow/contrib/optimizer_v2/BUILD index 85cfce346c5..5225ecc14fe 100644 --- a/tensorflow/contrib/optimizer_v2/BUILD +++ b/tensorflow/contrib/optimizer_v2/BUILD @@ -115,7 +115,6 @@ cuda_py_test( additional_deps = [ ":training", "@six_archive//:six", - "//tensorflow/contrib/eager/python:checkpointable_utils", "//tensorflow/python:constant_op", "//tensorflow/python:dtypes", "//tensorflow/python:framework_ops", diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index 2ef105755f2..0ac5a5bb6dd 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -66,7 +66,6 @@ COMMON_PIP_DEPS = [ "//tensorflow/contrib/data/python/kernel_tests:dataset_serialization_test", "//tensorflow/contrib/data/python/ops:contrib_op_loader", "//tensorflow/contrib/eager/python/examples:examples_pip", - "//tensorflow/contrib/eager/python:checkpointable_utils", "//tensorflow/contrib/eager/python:evaluator", "//tensorflow/contrib/gan:gan", "//tensorflow/contrib/graph_editor:graph_editor_pip", From 2273b62a769aa477f8d2ef02ca7dee253b8ea7b0 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 19 Apr 2018 10:05:08 -0700 Subject: [PATCH 0426/1734] Added support for concatenation and slicing of symbolic shapes PiperOrigin-RevId: 193532769 --- ...direct_session_with_tracking_alloc_test.cc | 4 +- tensorflow/core/framework/shape_inference.cc | 2 + tensorflow/core/framework/shape_inference.h | 12 + .../core/grappler/costs/graph_properties.cc | 236 ++++++++++++++++-- 4 files changed, 235 insertions(+), 19 deletions(-) diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc index 31fb128f937..b4dd521bbc8 100644 --- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc +++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc @@ -102,9 +102,9 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelTest) { EXPECT_EQ(2, shape.dim(0).size()); EXPECT_EQ(1, shape.dim(1).size()); if (node->name() == y->name()) { - EXPECT_EQ(3, cm->AllocationId(node, 0)); + EXPECT_EQ(7, cm->AllocationId(node, 0)); } else { - EXPECT_EQ(4, cm->AllocationId(node, 0)); + EXPECT_EQ(8, cm->AllocationId(node, 0)); } } EXPECT_LE(0, cm->MaxExecutionTime(node)); diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc index 229b4a45fa9..2b995e8b5e8 100644 --- a/tensorflow/core/framework/shape_inference.cc +++ b/tensorflow/core/framework/shape_inference.cc @@ -157,8 +157,10 @@ InferenceContext::~InferenceContext() {} Status InferenceContext::Run( const std::function& fn) { + ForgetMerges(); Status s = fn(this); if (!s.ok()) { + ForgetMerges(); return AttachContext(s); } #ifndef NDEBUG diff --git 
a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h index cdb4bd79bbb..9431a62abef 100644 --- a/tensorflow/core/framework/shape_inference.h +++ b/tensorflow/core/framework/shape_inference.h @@ -285,6 +285,8 @@ class InferenceContext { return true; } + void SetInput(int idx, ShapeHandle shape) { inputs_[idx] = shape; } + ShapeHandle input(int64 idx) const { return inputs_[idx]; } Status input(StringPiece input_name, std::vector* output) const; int num_inputs() const { return inputs_.size(); } @@ -317,6 +319,10 @@ class InferenceContext { input_tensors_as_shapes_ = input_tensors_as_shapes; } + const std::vector& input_tensors_as_shapes() const { + return input_tensors_as_shapes_; + } + ShapeHandle output(int64 idx) const { return outputs_[idx]; } void set_output(int idx, ShapeHandle shape) { outputs_[idx] = shape; } Status set_output(StringPiece output_name, @@ -587,6 +593,12 @@ class InferenceContext { int idx, const std::vector& shapes_and_types) TF_MUST_USE_RESULT; + void set_input_handle_shapes_and_types( + int idx, const std::vector& shapes_and_types) { + input_handle_shapes_and_types_[idx].reset( + new std::vector(shapes_and_types)); + } + // Returns the output handle shapes and types, for the resource tensor output // at index . Returns NULL if the shape and types were never set. const std::vector* output_handle_shapes_and_types(int idx) { diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc index a9c777e5512..c83ddfe90a0 100644 --- a/tensorflow/core/grappler/costs/graph_properties.cc +++ b/tensorflow/core/grappler/costs/graph_properties.cc @@ -18,8 +18,9 @@ limitations under the License. #include #include #include -#include "tensorflow/core/common_runtime/shape_refiner.h" +#include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/versions.pb.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/grappler/costs/utils.h" #include "tensorflow/core/grappler/utils.h" @@ -394,15 +395,121 @@ class TopoQueue { // unknown shape/dimension of a given node. class SymbolicShapeRefiner { public: - explicit SymbolicShapeRefiner(ShapeRefiner* shape_refiner) - : shape_refiner_(shape_refiner) {} + explicit SymbolicShapeRefiner(const GraphDef& graph) + : function_library_(OpRegistry::Global(), graph.library()) { + graph_def_version_ = graph.versions().producer(); + node_to_context_.reserve(graph.node_size()); + } InferenceContext* GetContext(const Node* node) { - return shape_refiner_->GetContext(node); + auto it = node_to_context_.find(node); + if (it == node_to_context_.end()) { + return nullptr; + } + return it->second.inference_context.get(); } Status UpdateNode(const Node* node, bool relax, bool* refined) { - return shape_refiner_->UpdateNode(node, relax, refined); + NodeContext* node_context = GetNodeContext(node); + if (node_context == nullptr) { + TF_RETURN_IF_ERROR(AddNode(node)); + node_context = CHECK_NOTNULL(GetNodeContext(node)); + *refined = true; + } + // Check if the shapes of the nodes in the fan-in of this node have changed, + // and if they have, update the node input shapes. 
+ InferenceContext* inference_context = node_context->inference_context.get(); + std::vector const_values(node->num_inputs()); + std::vector input_tensors(node->num_inputs(), nullptr); + std::vector input_tensors_as_shapes(node->num_inputs()); + + for (const Edge* e : node->in_edges()) { + if (e->IsControlEdge()) continue; + + int dst_input = e->dst_input(); + int src_output = e->src_output(); + + Node* input = e->src(); + NodeContext* c = GetNodeContext(input); + if (c == nullptr) { + return errors::FailedPrecondition( + "Input ", dst_input, " ('", input->name(), "') for '", node->name(), + "' was not previously added to ShapeRefiner."); + } + + if (input->IsConstant()) { + // Convert constant value into tensors. + if (const_values[dst_input].FromProto( + input->def().attr().at("value").tensor())) { + input_tensors[dst_input] = &const_values[dst_input]; + // Integer tensors of rank one can also be interpreted as a shape + // provided all their values are >= -1. + if (const_values[dst_input].dims() == 1 && + (const_values[dst_input].dtype() == DT_INT32 || + const_values[dst_input].dtype() == DT_INT64)) { + ShapeHandle tensor_shape = inference_context->Vector( + const_values[dst_input].NumElements()); + ShapeHandle shp; + if (inference_context + ->MakeShapeFromTensor(input_tensors[dst_input], + tensor_shape, &shp) + .ok()) { + input_tensors_as_shapes[dst_input] = shp; + } + } + } + } + + if (c->output_tensors_as_shapes.size() > src_output) { + input_tensors_as_shapes[dst_input] = + c->output_tensors_as_shapes[src_output]; + } + + DCHECK_GE(dst_input, 0); + if (!*refined && !inference_context->input(dst_input).SameHandle( + c->inference_context->output(src_output))) { + *refined = true; + } + inference_context->SetInput(dst_input, + c->inference_context->output(src_output)); + + if (!*refined && + inference_context->requested_input_tensor_as_partial_shape( + dst_input)) { + // The input value may have changed. Since we have no way to know if + // that's indeed the case, err on the safe side. + *refined = true; + } + + // Also propagate handle shape and dtype of edges which are carrying + // resource handles. + if (e->src()->output_type(src_output) == DT_RESOURCE) { + auto* outputs = + c->inference_context->output_handle_shapes_and_types(src_output); + if (!outputs) continue; + auto* inputs = + inference_context->input_handle_shapes_and_types(dst_input); + + if (!inputs || !EquivalentShapesAndTypes(*outputs, *inputs)) { + *refined = true; + } + inference_context->set_input_handle_shapes_and_types(dst_input, + *outputs); + } + } + + if (!*refined) { + // No input shape has changed, we're done + return Status::OK(); + } + + node_context->inference_context->set_input_tensors(input_tensors); + node_context->inference_context->set_input_tensors_as_shapes( + input_tensors_as_shapes); + + // Update the shapes of the outputs. + return InferShapes(node, node_context); } + Status SetUnknownShape(const Node* node, int output_port) { shape_inference::ShapeHandle shape = GetUnknownOutputShape(node, output_port); @@ -450,7 +557,7 @@ class SymbolicShapeRefiner { if (shape1.SameHandle(shape2)) { return shape1; } - InferenceContext* ctx = shape_refiner_->GetContext(node); + InferenceContext* ctx = GetContext(node); ShapeHandle merged = shape1; if (!ctx->RankKnown(shape2) && !ctx->RankKnown(shape1)) { // Return either one since they're expected to represent the same value. 
@@ -495,7 +602,7 @@ class SymbolicShapeRefiner { if (shape1.SameHandle(shape2)) { return shape1; } - InferenceContext* ctx = shape_refiner_->GetContext(node); + InferenceContext* ctx = GetContext(node); ShapeHandle relaxed = shape1; const int rank = ctx->Rank(shape1); if (!ctx->RankKnown(shape2) || ctx->Rank(shape2) != rank) { @@ -569,7 +676,7 @@ class SymbolicShapeRefiner { if (it != unknown_shapes_.end()) { return it->second; } - InferenceContext* c = shape_refiner_->GetContext(node); + InferenceContext* c = GetContext(node); ShapeHandle shp = c->UnknownShape(); unknown_shapes_[id] = shp; return shp; @@ -582,16 +689,114 @@ class SymbolicShapeRefiner { if (it != unknown_dims_.end()) { return it->second; } - InferenceContext* c = shape_refiner_->GetContext(node); + InferenceContext* c = GetContext(node); DimensionHandle dim = c->UnknownDim(); unknown_dims_[id] = dim; return dim; } - ShapeRefiner* shape_refiner_; + Status AddNode(const Node* node) { + // Create the inference context for this node. + std::vector<ShapeHandle> input_shapes(node->num_inputs()); + std::vector<std::unique_ptr<std::vector<ShapeAndType>>> + input_handle_shapes_and_types(node->num_inputs()); + std::vector<const Tensor*> input_tensors(node->num_inputs(), nullptr); + std::vector<ShapeHandle> input_tensors_as_shapes; + NodeContext& node_ctx = node_to_context_[node]; + node_ctx.inference_context.reset(new InferenceContext( + graph_def_version_, &node->def(), node->op_def(), input_shapes, + input_tensors, input_tensors_as_shapes, + std::move(input_handle_shapes_and_types))); + const Status s = node_ctx.inference_context->construction_status(); + if (!s.ok()) { + node_ctx.inference_context.reset(nullptr); + } + return s; + } + + struct NodeContext { + std::unique_ptr<InferenceContext> inference_context; + std::vector<ShapeHandle> output_tensors_as_shapes; + }; + + Status InferShapes(const Node* node, NodeContext* c) { + InferenceContext* ic = c->inference_context.get(); + + // Propagate shape tensors + if (node->type_string() == "Shape") { + c->output_tensors_as_shapes.resize(1); + c->output_tensors_as_shapes[0] = c->inference_context->input(0); + } else if (node->type_string() == "ShapeN") { + c->output_tensors_as_shapes.resize(c->inference_context->num_inputs()); + for (int i = 0; i < c->inference_context->num_inputs(); ++i) { + c->output_tensors_as_shapes[i] = c->inference_context->input(i); + } + } else if (node->type_string() == "ConcatV2") { + bool valid = true; + ShapeHandle result; + for (int i = 0; i < ic->num_inputs() - 1; ++i) { + ShapeHandle input = ic->input_tensors_as_shapes()[i]; + if (!ic->RankKnown(input)) { + valid = false; + break; + } else if (i == 0) { + result = input; + } else { + TF_RETURN_IF_ERROR(ic->Concatenate(result, input, &result)); + } + } + if (valid) { + c->output_tensors_as_shapes.resize(1); + c->output_tensors_as_shapes[0] = result; + } + } else if (node->type_string() == "Slice") { + ShapeHandle input = ic->input_tensors_as_shapes()[0]; + bool valid = ic->RankKnown(input); + const Tensor* slice_offset = ic->input_tensor(1); + valid &= slice_offset != nullptr && slice_offset->NumElements() == 1; + const Tensor* slice_size = ic->input_tensor(2); + valid &= slice_size != nullptr && slice_size->NumElements() == 1; + if (valid) { + int64 start = slice_offset->dtype() == DT_INT32 + ? slice_offset->flat<int32>()(0) + : slice_offset->flat<int64>()(0); + int64 end = start + (slice_size->dtype() == DT_INT32 + ? 
slice_size->flat<int32>()(0) : slice_size->flat<int64>()(0)); + ShapeHandle result; + TF_RETURN_IF_ERROR(ic->Subshape(input, start, end, &result)); + c->output_tensors_as_shapes.resize(1); + c->output_tensors_as_shapes[0] = result; + } + } + + // Infer the shapes of output tensors. + const OpRegistrationData* op_reg_data; + Status s = function_library_.default_registry()->LookUp(node->type_string(), + &op_reg_data); + if (!s.ok() || op_reg_data->shape_inference_fn == nullptr) { + // There is nothing more we can infer, annotate outputs with unknown + // shapes + return c->inference_context->Run(shape_inference::UnknownShape); + } + + return c->inference_context->Run(op_reg_data->shape_inference_fn); + } + + NodeContext* GetNodeContext(const Node* node) { + auto it = node_to_context_.find(node); + if (it == node_to_context_.end()) { + return nullptr; + } + return &it->second; + } + + int graph_def_version_; + std::unordered_map<const Node*, NodeContext> node_to_context_; std::unordered_map unknown_shapes_; std::unordered_map unknown_dims_; + FunctionLibraryDefinition function_library_; }; // Keep track of shapes and dimensions in a graph. @@ -977,9 +1182,6 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) { item_.graph.library()); Graph graph(function_library); graph_ = &graph; - ShapeRefiner shape_refiner(graph.versions(), graph.op_registry()); - shape_refiner.set_require_shape_inference_fns(false); - shape_refiner.set_disable_constant_propagation(true); ImportGraphDefOptions options; // Graph optimization happens at the late stage of graph execution, // when colocation constraints are already validated previously and @@ -987,7 +1189,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) { // is no need to validate colocation constraints again. options.validate_colocation_constraints = false; options.validate_shape = false; - Status s = ImportGraphDef(options, item_.graph, &graph, &shape_refiner); + Status s = ImportGraphDef(options, item_.graph, &graph, nullptr); TF_RETURN_IF_ERROR(s); std::unordered_map<string, std::unordered_set<int>> fed_ports; @@ -1041,7 +1243,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) { } } - SymbolicShapeRefiner refiner(&shape_refiner); + SymbolicShapeRefiner refiner(item_.graph); // We propagate shapes through the graph in two phases. In the first phase, we // exclusively merge shapes but we do not propagate shapes through the @@ -1073,7 +1275,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) { SymbolicShapeManager shape_manager; bool found_error = false; for (const Node* const node : graph.nodes()) { - auto node_ctx = shape_refiner.GetContext(node); + auto node_ctx = refiner.GetContext(node); if (!node_ctx) { continue; } @@ -1105,7 +1307,7 @@ for (const Node* const node : graph.nodes()) { VLOG(3) << "Filling in graph properties for node: " << node->name(); - auto ctx = shape_refiner.GetContext(node); + auto ctx = refiner.GetContext(node); if (!ctx) { continue; } From bdcca449fc22cf1d8a1d6a2c01c3b67706d6023b Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Thu, 19 Apr 2018 10:14:09 -0700 Subject: [PATCH 0427/1734] Prototype for tf.data writer API. 
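(An aside on the `Shape`/`ShapeN`/`ConcatV2`/`Slice` special cases added in the patch above, before the body of the tf.data writer patch that follows: the refiner treats rank-one integer tensors as symbolic shapes and evaluates these ops on them directly. A rough Python sketch of the idea, with plain lists standing in for ShapeHandles; the function name is made up:)

def propagate_shape_tensor(op, inputs, start=None, size=None):
  # Each entry of `inputs` is a shape-as-value, e.g. [8, 32, 32, 3],
  # or None when its rank is unknown.
  if op == "Shape":
    return inputs[0]       # the output *value* of Shape is the input's shape
  if op == "ConcatV2":
    out = []
    for s in inputs[:-1]:  # the last input is the concat axis
      if s is None:
        return None        # unknown rank: give up, as the C++ code does
      out += s
    return out
  if op == "Slice":
    s = inputs[0]
    if s is None or start is None or size is None:
      return None
    return s[start:start + size]
  return None

# e.g. slicing two dimensions out of a known shape tensor:
assert propagate_shape_tensor("Slice", [[8, 32, 32, 3]], start=1, size=2) == [32, 32]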
PiperOrigin-RevId: 193534333 --- .../contrib/data/python/kernel_tests/BUILD | 20 +++ .../python/kernel_tests/writer_ops_test.py | 117 ++++++++++++++++++ tensorflow/contrib/data/python/ops/BUILD | 13 ++ tensorflow/contrib/data/python/ops/writers.py | 58 +++++++++ .../base_api/api_def_DatasetToTFRecord.pbtxt | 24 ++++ tensorflow/core/framework/dataset.h | 4 +- tensorflow/core/kernels/data/BUILD | 14 +++ tensorflow/core/kernels/data/writer_ops.cc | 113 +++++++++++++++++ tensorflow/core/ops/dataset_ops.cc | 6 + 9 files changed, 367 insertions(+), 2 deletions(-) create mode 100644 tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py create mode 100644 tensorflow/contrib/data/python/ops/writers.py create mode 100644 tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt create mode 100644 tensorflow/core/kernels/data/writer_ops.cc diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD index c554607960b..83daa04efc9 100644 --- a/tensorflow/contrib/data/python/kernel_tests/BUILD +++ b/tensorflow/contrib/data/python/kernel_tests/BUILD @@ -516,3 +516,23 @@ tf_py_test( "//third_party/py/numpy", ], ) + +tf_py_test( + name = "writer_ops_test", + size = "small", + srcs = ["writer_ops_test.py"], + additional_deps = [ + "//tensorflow/contrib/data/python/ops:writers", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:dataset_ops_gen", + "//tensorflow/python:dtypes", + "//tensorflow/python:errors", + "//tensorflow/python:framework_ops", + "//tensorflow/python:io_ops", + "//tensorflow/python:lib", + "//tensorflow/python:tensor_shape", + "//tensorflow/python:util", + "//tensorflow/python/data/ops:readers", + ], +) diff --git a/tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py new file mode 100644 index 00000000000..c603ecc5ab2 --- /dev/null +++ b/tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py @@ -0,0 +1,117 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for the experimental input pipeline ops.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from tensorflow.contrib.data.python.ops import writers +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.data.ops import readers +from tensorflow.python.framework import dtypes +from tensorflow.python.lib.io import python_io +from tensorflow.python.lib.io import tf_record +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import test +from tensorflow.python.util import compat + + +class TFRecordWriterTest(test.TestCase): + + def setUp(self): + super(TFRecordWriterTest, self).setUp() + self._num_records = 7 + self.filename = array_ops.placeholder(dtypes.string, shape=[]) + self.compression_type = array_ops.placeholder_with_default("", shape=[]) + + input_dataset = readers.TFRecordDataset([self.filename], + self.compression_type) + self.writer = writers.TFRecordWriter( + self._outputFilename(), self.compression_type).write(input_dataset) + + def _record(self, i): + return compat.as_bytes("Record %d" % (i)) + + def _createFile(self, options=None): + filename = self._inputFilename() + writer = python_io.TFRecordWriter(filename, options) + for i in range(self._num_records): + writer.write(self._record(i)) + writer.close() + return filename + + def _inputFilename(self): + return os.path.join(self.get_temp_dir(), "tf_record.in.txt") + + def _outputFilename(self): + return os.path.join(self.get_temp_dir(), "tf_record.out.txt") + + def testWrite(self): + with self.test_session() as sess: + sess.run( + self.writer, feed_dict={ + self.filename: self._createFile(), + }) + for i, r in enumerate(tf_record.tf_record_iterator(self._outputFilename())): + self.assertAllEqual(self._record(i), r) + + def testWriteZLIB(self): + options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.ZLIB) + with self.test_session() as sess: + sess.run( + self.writer, + feed_dict={ + self.filename: self._createFile(options), + self.compression_type: "ZLIB", + }) + for i, r in enumerate( + tf_record.tf_record_iterator(self._outputFilename(), options=options)): + self.assertAllEqual(self._record(i), r) + + def testWriteGZIP(self): + options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.GZIP) + with self.test_session() as sess: + sess.run( + self.writer, + feed_dict={ + self.filename: self._createFile(options), + self.compression_type: "GZIP", + }) + for i, r in enumerate( + tf_record.tf_record_iterator(self._outputFilename(), options=options)): + self.assertAllEqual(self._record(i), r) + + def testFailDataset(self): + with self.assertRaises(TypeError): + writers.TFRecordWriter(self._outputFilename(), + self.compression_type).write("whoops") + + def testFailDType(self): + input_dataset = dataset_ops.Dataset.from_tensors(10) + with self.assertRaises(TypeError): + writers.TFRecordWriter(self._outputFilename(), + self.compression_type).write(input_dataset) + + def testFailShape(self): + input_dataset = dataset_ops.Dataset.from_tensors([["hello"], ["world"]]) + with self.assertRaises(TypeError): + writers.TFRecordWriter(self._outputFilename(), + self.compression_type).write(input_dataset) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD index e00f2304cc4..5b04c5316cf 100644 --- 
a/tensorflow/contrib/data/python/ops/BUILD +++ b/tensorflow/contrib/data/python/ops/BUILD @@ -280,6 +280,18 @@ py_library( ], ) +py_library( + name = "writers", + srcs = [ + "writers.py", + ], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/python:dtypes", + "//tensorflow/python/data/ops:dataset_ops", + ], +) + tf_gen_op_wrapper_py( name = "gen_dataset_ops", out = "gen_dataset_ops.py", @@ -342,6 +354,7 @@ py_library( ":stats_ops", ":threadpool", ":unique", + ":writers", "//tensorflow/python:dataset_ops_gen", "//tensorflow/python:util", "//tensorflow/python/data/ops:dataset_ops", diff --git a/tensorflow/contrib/data/python/ops/writers.py b/tensorflow/contrib/data/python/ops/writers.py new file mode 100644 index 00000000000..f53bd3f7383 --- /dev/null +++ b/tensorflow/contrib/data/python/ops/writers.py @@ -0,0 +1,58 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Python wrappers for tf.data writers.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.data.util import convert +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import gen_dataset_ops + + +class TFRecordWriter(object): + """Writes data to a TFRecord file.""" + + def __init__(self, filename, compression_type=None): + self._filename = ops.convert_to_tensor( + filename, dtypes.string, name="filename") + self._compression_type = convert.optional_param_to_tensor( + "compression_type", + compression_type, + argument_default="", + argument_dtype=dtypes.string) + + def write(self, dataset): + """Returns a @{tf.Operation} to write a dataset to a file. + + Args: + dataset: a @{tf.data.Dataset} whose elements are to be written to a file + + Returns: + A @{tf.Operation} that, when run, writes contents of `dataset` to a file. 
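For reference, a hypothetical end-to-end use of this class, mirroring the tests above (the output path is made up; everything else is the API defined in this patch):

import tensorflow as tf
from tensorflow.contrib.data.python.ops import writers
from tensorflow.python.data.ops import dataset_ops

# write() requires a dataset of scalar DT_STRING tensors.
dataset = dataset_ops.Dataset.from_tensor_slices(
    [b"record 0", b"record 1", b"record 2"])
write_op = writers.TFRecordWriter(
    "/tmp/example.tfrecord", compression_type="GZIP").write(dataset)

with tf.Session() as sess:
  sess.run(write_op)  # runs DatasetToTFRecord and writes every element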
+ """ + if not isinstance(dataset, dataset_ops.Dataset): + raise TypeError("`dataset` must be a `tf.data.Dataset` object.") + if (dataset.output_types != dtypes.string or + dataset.output_shapes != tensor_shape.scalar()): + raise TypeError( + "`dataset` must produce scalar `DT_STRING` tensors whereas it " + "produces shape {0} and types {1}".format(dataset.output_shapes, + dataset.output_types)) + return gen_dataset_ops.dataset_to_tf_record( + dataset._as_variant_tensor(), self._filename, self._compression_type) # pylint: disable=protected-access diff --git a/tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt b/tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt new file mode 100644 index 00000000000..e1b8a9abdd2 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt @@ -0,0 +1,24 @@ +op { + graph_op_name: "DatasetToTFRecord" + visibility: HIDDEN + in_arg { + name: "input_dataset" + description: <& parent) { return parent->SaveInternal(writer); @@ -372,7 +372,7 @@ class IteratorBase { // This is needed so that sub-classes of IteratorBase can call // `RestoreInternal` on their parent iterators, e.g., in - // `RepeatDataasetOp::Dataset`. + // `RepeatDatasetOp::Dataset`. Status RestoreParent(IteratorContext* ctx, IteratorStateReader* reader, const std::unique_ptr& parent) { return parent->RestoreInternal(ctx, reader); diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD index 1e96eb6421d..667a6967a85 100644 --- a/tensorflow/core/kernels/data/BUILD +++ b/tensorflow/core/kernels/data/BUILD @@ -576,6 +576,20 @@ tf_kernel_library( ":tensor_queue_dataset_op", ":tensor_slice_dataset_op", ":unbatch_dataset_op", + ":writer_ops", ":zip_dataset_op", ], ) + +tf_kernel_library( + name = "writer_ops", + srcs = ["writer_ops.cc"], + deps = [ + ":dataset", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core/kernels:ops_util", + ], +) diff --git a/tensorflow/core/kernels/data/writer_ops.cc b/tensorflow/core/kernels/data/writer_ops.cc new file mode 100644 index 00000000000..46821fd7b3a --- /dev/null +++ b/tensorflow/core/kernels/data/writer_ops.cc @@ -0,0 +1,113 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/data/dataset.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/io/record_writer.h" +#include "tensorflow/core/platform/file_system.h" + +namespace tensorflow { + +namespace { + +class ToTFRecordOp : public AsyncOpKernel { + public: + explicit ToTFRecordOp(OpKernelConstruction* ctx) + : AsyncOpKernel(ctx), + thread_pool_(new thread::ThreadPool( + ctx->env(), ThreadOptions(), + strings::StrCat("to_tf_record__op_", SanitizeThreadSuffix(name())), + 1 /* num_threads */, false /* low_latency_hint */)) {} + + template <typename T> + Status ParseScalarArgument(OpKernelContext* ctx, + const StringPiece& argument_name, T* output) { + const Tensor* argument_t; + TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t)); + if (!TensorShapeUtils::IsScalar(argument_t->shape())) { + return errors::InvalidArgument(argument_name, " must be a scalar"); + } + *output = argument_t->scalar<T>()(); + return Status::OK(); + } + + void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override { + // The call to `iterator->GetNext()` may block and depend on an + // inter-op thread pool thread, so we issue the call from the + // owned thread pool. + thread_pool_->Schedule([this, ctx, done]() { + string filename; + OP_REQUIRES_OK_ASYNC( + ctx, ParseScalarArgument<string>(ctx, "filename", &filename), done); + string compression_type; + OP_REQUIRES_OK_ASYNC(ctx, + ParseScalarArgument<string>(ctx, "compression_type", + &compression_type), + done); + std::unique_ptr<WritableFile> file; + OP_REQUIRES_OK_ASYNC(ctx, ctx->env()->NewWritableFile(filename, &file), + done); + std::unique_ptr<io::RecordWriter> writer; + writer.reset(new io::RecordWriter( + file.get(), io::RecordWriterOptions::CreateRecordWriterOptions( + compression_type))); + + DatasetBase* dataset; + OP_REQUIRES_OK_ASYNC( + ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done); + auto iterator = dataset->MakeIterator("ToTFRecordOpIterator"); + + IteratorContext::Params params; // TODO(b/78245447) + params.env = ctx->env(); + params.runner = *(ctx->runner()); + params.lib = ctx->function_library(); + DeviceBase* device = ctx->function_library()->device(); + params.allocator_getter = [device](AllocatorAttributes attrs) { + return device->GetAllocator(attrs); + }; + + IteratorContext iter_ctx(std::move(params)); + + std::vector<Tensor> components; + components.reserve(dataset->output_dtypes().size()); + bool end_of_sequence; + + do { + OP_REQUIRES_OK_ASYNC( + ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence), + done); + + if (!end_of_sequence) { + OP_REQUIRES_OK_ASYNC( + ctx, writer->WriteRecord(components[0].scalar<string>()()), done); + } + components.clear(); + } while (!end_of_sequence); + done(); + }); + } + + private: + std::unique_ptr<thread::ThreadPool> thread_pool_; +}; + +REGISTER_KERNEL_BUILDER(Name("DatasetToTFRecord").Device(DEVICE_CPU), + ToTFRecordOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc index 8be569b315d..67c6c58fe2f 100644 --- a/tensorflow/core/ops/dataset_ops.cc +++ b/tensorflow/core/ops/dataset_ops.cc @@ -551,4 +551,10 @@ REGISTER_OP("EnqueueInQueueDataset") // reading from queue handle (is that even possible?). 
.SetShapeFn(shape_inference::NoOutputs); +REGISTER_OP("DatasetToTFRecord") + .Input("input_dataset: variant") + .Input("filename: string") + .Input("compression_type: string") + .SetShapeFn(shape_inference::NoOutputs); + } // namespace tensorflow From 5fbd21e3bbd4f89dd2c6eed8a63b66ee2eff40a0 Mon Sep 17 00:00:00 2001 From: Ian Langmore Date: Thu, 19 Apr 2018 10:20:43 -0700 Subject: [PATCH 0428/1734] distribution_util moved into its own BUILD target, so linear_operator can depend on it. PiperOrigin-RevId: 193535400 --- tensorflow/python/ops/distributions/BUILD | 26 ++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/ops/distributions/BUILD b/tensorflow/python/ops/distributions/BUILD index 9d9ede7ad75..e7ad028376b 100644 --- a/tensorflow/python/ops/distributions/BUILD +++ b/tensorflow/python/ops/distributions/BUILD @@ -8,9 +8,13 @@ licenses(["notice"]) # Apache 2.0 py_library( name = "distributions", - srcs = glob(["*.py"]), + srcs = glob( + ["*.py"], + exclude = ["util.py"], + ), srcs_version = "PY2AND3", deps = [ + ":util", "//tensorflow/python:array_ops", "//tensorflow/python:check_ops", "//tensorflow/python:control_flow_ops", @@ -26,3 +30,23 @@ py_library( "@six_archive//:six", ], ) + +py_library( + name = "util", + srcs = ["util.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/python:array_ops", + "//tensorflow/python:check_ops", + "//tensorflow/python:control_flow_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:math_ops", + "//tensorflow/python:nn", + "//tensorflow/python:nn_ops", + "//tensorflow/python:random_ops", + "//tensorflow/python:special_math_ops", + "//tensorflow/python:tensor_util", + "//third_party/py/numpy", + "@six_archive//:six", + ], +) From 72240a9b5e67e315f6c037bb4579df9709335e35 Mon Sep 17 00:00:00 2001 From: imsheridan Date: Fri, 20 Apr 2018 01:23:54 +0800 Subject: [PATCH 0429/1734] fix single paragraph format and also arrow like format --- tensorflow/contrib/optimizer_v2/adam.py | 16 ++++++++-------- .../api_def/base_api/api_def_ApplyAdam.pbtxt | 8 ++++---- .../base_api/api_def_ResourceApplyAdam.pbtxt | 8 ++++---- tensorflow/python/training/adam.py | 16 ++++++++-------- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py index a38c98f4711..76a867039af 100644 --- a/tensorflow/contrib/optimizer_v2/adam.py +++ b/tensorflow/contrib/optimizer_v2/adam.py @@ -40,19 +40,19 @@ class AdamOptimizer(optimizer_v2.OptimizerV2): Initialization: - \\(m_0 <- 0\\) (Initialize initial 1st moment vector) - \\(v_0 <- 0\\) (Initialize initial 2nd moment vector) - \\(t <- 0\\) (Initialize timestep) + $$m_0 \Leftarrow 0 (Initialize initial 1st moment vector)$$ + $$v_0 \Leftarrow 0 (Initialize initial 2nd moment vector)$$ + $$t \Leftarrow 0 (Initialize timestep)$$ The update rule for `variable` with gradient `g` uses an optimization described at the end of section2 of the paper: - $$t <- t + 1$$ - $$lr_t <- \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$ + $$t \Leftarrow t + 1$$ + $$lr_t \Leftarrow \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$ - $$m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g$$ - $$v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g$$ - $$variable <- variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$ + $$m_t \Leftarrow beta_1 * m_{t-1} + (1 - beta_1) * g$$ + $$v_t \Leftarrow beta_2 * v_{t-1} + (1 - beta_2) * g * g$$ + $$variable \Leftarrow variable - lr_t * 
m_t / (\sqrt{v_t} + \epsilon)$$ The default value of 1e-8 for epsilon might not be a good default in general. For example, when training an Inception network on ImageNet a diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt index fc2cb094716..fca8ba25306 100644 --- a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt @@ -82,9 +82,9 @@ END } summary: "Update \'*var\' according to the Adam algorithm." description: < Date: Thu, 19 Apr 2018 10:26:26 -0700 Subject: [PATCH 0430/1734] Fix doc gen error Mismatch after the fix in #17815 --- tensorflow/contrib/tensor_forest/ops/stats_ops.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/tensor_forest/ops/stats_ops.cc b/tensorflow/contrib/tensor_forest/ops/stats_ops.cc index be0a11546d2..5be581aaec4 100644 --- a/tensorflow/contrib/tensor_forest/ops/stats_ops.cc +++ b/tensorflow/contrib/tensor_forest/ops/stats_ops.cc @@ -75,7 +75,7 @@ REGISTER_OP("GrowTreeV4") .Attr("params: string") .Input("tree_handle: resource") .Input("stats_handle: resource") - .Input("finshed_nodes: int32") + .Input("finished_nodes: int32") .SetShapeFn(tensorflow::shape_inference::NoOutputs) .Doc(R"doc( Grows the tree for finished nodes and allocates waiting nodes. From ba3bc495bbf1140e9375e1ec03c3ff788b8ebc6e Mon Sep 17 00:00:00 2001 From: Anjali Sridhar Date: Thu, 19 Apr 2018 10:26:54 -0700 Subject: [PATCH 0431/1734] Add metric names to model.metrics_names in compile for keras models run in eager execution. This prevents us from dropping metrics when we run model.evaluate. PiperOrigin-RevId: 193536341 --- .../keras/_impl/keras/engine/training.py | 29 ++------- .../_impl/keras/engine/training_eager.py | 39 ++++-------- .../_impl/keras/engine/training_eager_test.py | 12 ++-- .../keras/_impl/keras/engine/training_test.py | 26 ++++++++ .../_impl/keras/engine/training_utils.py | 62 +++++++++++++++++++ 5 files changed, 109 insertions(+), 59 deletions(-) diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py index 7c467438145..012d9ceea43 100644 --- a/tensorflow/python/keras/_impl/keras/engine/training.py +++ b/tensorflow/python/keras/_impl/keras/engine/training.py @@ -276,6 +276,8 @@ class Model(Network): self.metrics_names.append(self.output_names[i] + '_loss') self.nested_metrics = training_utils.collect_metrics(metrics, self.output_names) + with K.name_scope('metrics'): + training_utils.populate_metric_names(self) self._feed_sample_weight_modes = [] for i in range(len(self.outputs)): self._feed_sample_weight_modes.append(None) @@ -462,7 +464,6 @@ class Model(Network): output_weighted_metrics = nested_weighted_metrics[i] def handle_metrics(metrics, weights=None): - metric_name_prefix = 'weighted_' if weights is not None else '' for metric in metrics: if metric in ('accuracy', 'acc', 'crossentropy', 'ce'): @@ -489,39 +490,19 @@ class Model(Network): metric_fn = metrics_module.categorical_accuracy elif metric in ('crossentropy', 'ce'): metric_fn = metrics_module.categorical_crossentropy - if metric in ('accuracy', 'acc'): - suffix = 'acc' - elif metric in ('crossentropy', 'ce'): - suffix = 'ce' weighted_metric_fn = training_utils.weighted_masked_objective( metric_fn) - metric_name = metric_name_prefix + suffix else: metric_fn = metrics_module.get(metric) weighted_metric_fn = training_utils.weighted_masked_objective( metric_fn) - # 
Get metric name as string - if hasattr(metric_fn, 'name'): - metric_name = metric_fn.name - else: - metric_name = metric_fn.__name__ - metric_name = metric_name_prefix + metric_name - + metric_name = training_utils.get_base_metric_name( + metric, weighted=weights is not None) with K.name_scope(metric_name): metric_result = weighted_metric_fn( y_true, y_pred, weights=weights, mask=masks[i]) - # Append to self.metrics_names, self.metric_tensors, - # self.stateful_metric_names - if len(self.output_names) > 1: - metric_name = '%s_%s' % (self.output_names[i], metric_name) - # Dedupe name - j = 1 - base_metric_name = metric_name - while metric_name in self.metrics_names: - metric_name = '%s_%d' % (base_metric_name, j) - j += 1 - self.metrics_names.append(metric_name) + training_utils.add_metric_name(self, metric_name, i) self.metrics_tensors.append(metric_result) # Keep track of state updates created by diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager.py b/tensorflow/python/keras/_impl/keras/engine/training_eager.py index 695669d9ee1..ad239d6151e 100644 --- a/tensorflow/python/keras/_impl/keras/engine/training_eager.py +++ b/tensorflow/python/keras/_impl/keras/engine/training_eager.py @@ -100,7 +100,7 @@ def _eager_metrics_fn(model, outputs, targets): metric_names.append(metric_name) metric_results.append(backend.mean(metric_result)) - return metric_names, metric_results + return metric_results def _model_loss(model, inputs, targets, sample_weights=None, training=False): @@ -151,7 +151,12 @@ def _model_loss(model, inputs, targets, sample_weights=None, training=False): with backend.name_scope(model.output_names[i] + '_loss'): output_loss = weighted_masked_fn( targets[i], outs[i], weights, mask=mask) - loss_metrics.append(backend.mean(output_loss)) + # If the number of outputs is 1 then we don't append the loss metric + # associated with each model output. When there are multiple outputs + # associated with a model, each output's loss is calculated and returned + # as part of the loss_metrics. + if len(model.outputs) > 1: + loss_metrics.append(backend.mean(output_loss)) loss_weight = model.loss_weights_list[i] if total_loss is None: @@ -274,7 +279,7 @@ def train_on_batch(model, inputs, targets, sample_weights=None): model, inputs, targets, sample_weights=sample_weights, training=True) if not isinstance(outs, list): outs = [outs] - _, metrics_results = _eager_metrics_fn( + metrics_results = _eager_metrics_fn( model, outs, targets) if not isinstance(loss, list): loss = [loss] @@ -304,7 +309,7 @@ def test_on_batch(model, inputs, targets, sample_weights=None): model, inputs, targets, sample_weights=sample_weights, training=False) if not isinstance(outs, list): outs = [outs] - _, metrics_results = _eager_metrics_fn( + metrics_results = _eager_metrics_fn( model, outs, targets) if not isinstance(loss, list): loss = [loss] @@ -498,34 +503,12 @@ def fit_loop( for l, o in zip(out_labels, outs): batch_logs[l] = o # Required for Eager mode - metrics_names, metrics_results = _eager_metrics_fn( - model, outs, targets_batch) + metrics_results = _eager_metrics_fn(model, outs, targets_batch) batch_logs['loss'] = tensor_util.constant_value(backend.mean(loss)) - # TODO(anjalisridhar): Move this to compile to avoid duplicate code. - # In graph mode we set the metric names in compile. However in - # Eager mode we calculate the metrics for each batch in fit_loop. - # We could calculate the metric names and functions in compile. - # This would avoid setting the callback parameters separately. 
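As a concrete illustration of the naming scheme this refactoring centralizes in `training_utils` (layer names follow the test added further below; the dedup helper is a sketch, not the TF function itself):

# A two-output model compiled with metrics=['mae', 'acc'] gets per-output,
# deduplicated metric names, identically in graph and eager mode:
#   ['loss', 'dense_loss', 'dropout_loss',
#    'dense_mean_absolute_error', 'dense_acc',
#    'dropout_mean_absolute_error', 'dropout_acc']

def dedup_metric_name(existing_names, metric_name):
  # Mirrors add_metric_name: append _1, _2, ... until the name is unique.
  candidate, j = metric_name, 1
  while candidate in existing_names:
    candidate = '%s_%d' % (metric_name, j)
    j += 1
  return candidate

assert dedup_metric_name(['dense_acc'], 'dense_acc') == 'dense_acc_1'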
- # We need to do this for the first iteration alone - for m in metrics_names: - if m not in callback_metrics: - callback_metrics.append(m) - - callbacks.set_params({ - 'batch_size': batch_size, - 'epochs': epochs, - 'steps': steps_per_epoch, - 'samples': num_train_samples, - 'verbose': verbose, - 'do_validation': do_validation, - 'metrics': callback_metrics or [], - }) - for k, v in zip(model.metrics_names, [backend.mean(loss)] + loss_metrics + metrics_results): batch_logs[k] = tensor_util.constant_value(v) - callbacks.on_batch_end(batch_index, batch_logs) if callback_model.stop_training: break @@ -611,7 +594,7 @@ def test_loop(model, inputs, targets, targets_batch, sample_weights=sample_weights_batch, training=False) - _, metrics_results = _eager_metrics_fn(model, loss_outs, targets_batch) + metrics_results = _eager_metrics_fn(model, loss_outs, targets_batch) batch_outs = [] for _, v in zip(model.metrics_names, [backend.mean(loss)] + loss_metrics + metrics_results): diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py index ed0f91ee1e2..deaf1d13064 100644 --- a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py +++ b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py @@ -212,7 +212,7 @@ class TrainingTest(test.TestCase): optimizer = RMSPropOptimizer(learning_rate=0.001) loss = 'mse' loss_weights = [1., 0.5] - metrics = ['mae'] + metrics = ['acc', 'mae'] model.compile( optimizer, loss, @@ -231,20 +231,20 @@ class TrainingTest(test.TestCase): [input_a_np, input_b_np], [output_d_np, output_e_np], batch_size=5, verbose=0) - self.assertEqual(len(out), 5) + self.assertEqual(len(out), 7) out = model.evaluate( [input_a_np, input_b_np], [output_d_np, output_e_np], batch_size=5, verbose=1) - self.assertEqual(len(out), 5) + self.assertEqual(len(out), 7) out = model.evaluate( [input_a_np, input_b_np], [output_d_np, output_e_np], batch_size=5, verbose=2) - self.assertEqual(len(out), 5) + self.assertEqual(len(out), 7) out = model.test_on_batch([input_a_np, input_b_np], [output_d_np, output_e_np]) - self.assertEqual(len(out), 5) + self.assertEqual(len(out), 7) # Test evaluate with dictionary inputs model.evaluate( @@ -625,7 +625,6 @@ class LossWeightingTest(test.TestCase): bad_w_np = np.random.random((10, 2, 2)) model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np}) - class CorrectnessTest(test.TestCase): @tf_test_util.run_in_graph_and_eager_modes() @@ -649,7 +648,6 @@ class CorrectnessTest(test.TestCase): self.assertEqual( np.around(history.history['loss'][-1], decimals=4), 0.6173) - if __name__ == '__main__': ops.enable_eager_execution() test.main() diff --git a/tensorflow/python/keras/_impl/keras/engine/training_test.py b/tensorflow/python/keras/_impl/keras/engine/training_test.py index 6699fd5212f..d9281436dee 100644 --- a/tensorflow/python/keras/_impl/keras/engine/training_test.py +++ b/tensorflow/python/keras/_impl/keras/engine/training_test.py @@ -24,12 +24,15 @@ import unittest import numpy as np from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import test_util as tf_test_util from tensorflow.python.keras._impl import keras from tensorflow.python.keras._impl.keras import testing_utils from tensorflow.python.keras._impl.keras.engine.training_utils import weighted_masked_objective from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays from tensorflow.python.ops import array_ops from 
tensorflow.python.platform import test +from tensorflow.python.training.rmsprop import RMSPropOptimizer + try: import scipy.sparse as scipy_sparse # pylint: disable=g-import-not-at-top @@ -1684,6 +1687,29 @@ class TestTrainingWithDataTensors(test.TestCase): model.train_on_batch([input_a_np, input_b_np], [output_a_np, output_b_np]) + @tf_test_util.run_in_graph_and_eager_modes() + def test_metric_names_are_identical_in_graph_and_eager(self): + a = keras.layers.Input(shape=(3,), name='input_a') + b = keras.layers.Input(shape=(3,), name='input_b') + + dense = keras.layers.Dense(4, name='dense') + c = dense(a) + d = dense(b) + e = keras.layers.Dropout(0.5, name='dropout')(c) + + model = keras.models.Model([a, b], [d, e]) + + optimizer = RMSPropOptimizer(learning_rate=0.001) + loss = 'mse' + loss_weights = [1., 0.5] + metrics = ['mae', 'acc'] + model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights) + reference_metric_names = ['loss', 'dense_loss', 'dropout_loss', + 'dense_mean_absolute_error', + 'dense_acc', + 'dropout_mean_absolute_error', + 'dropout_acc'] + self.assertEqual(reference_metric_names, model.metrics_names) if __name__ == '__main__': # Bazel sets these environment variables to very long paths. diff --git a/tensorflow/python/keras/_impl/keras/engine/training_utils.py b/tensorflow/python/keras/_impl/keras/engine/training_utils.py index 48afe48e6c0..662938f421b 100644 --- a/tensorflow/python/keras/_impl/keras/engine/training_utils.py +++ b/tensorflow/python/keras/_impl/keras/engine/training_utils.py @@ -26,6 +26,7 @@ from tensorflow.python.eager import context from tensorflow.python.framework import tensor_util from tensorflow.python.keras._impl.keras import backend as K from tensorflow.python.keras._impl.keras import losses +from tensorflow.python.keras._impl.keras import metrics as metrics_module from tensorflow.python.ops import math_ops @@ -552,3 +553,64 @@ def standardize_weights(y, def has_symbolic_tensors(ls): return (any(tensor_util.is_tensor(v) for v in ls) and not context.executing_eagerly()) + + +def populate_metric_names(model): + for i in range(len(model.outputs)): + metrics = model.nested_metrics[i] + for metric in metrics: + base_metric_name = get_base_metric_name(metric) + add_metric_name(model, base_metric_name, i) + + +def get_base_metric_name(metric, weighted=False): + """Returns the metric name given the metric function. + + Arguments: + metric: Metric function name or reference. + weighted: Boolean indicating if the metric for which we are adding + names is weighted. + + Returns: + a metric name. + """ + metric_name_prefix = 'weighted_' if weighted else '' + if metric in ('accuracy', 'acc', 'crossentropy', 'ce'): + if metric in ('accuracy', 'acc'): + suffix = 'acc' + elif metric in ('crossentropy', 'ce'): + suffix = 'ce' + metric_name = metric_name_prefix + suffix + else: + metric_fn = metrics_module.get(metric) + # Get metric name as string + if hasattr(metric_fn, 'name'): + metric_name = metric_fn.name + else: + metric_name = metric_fn.__name__ + metric_name = metric_name_prefix + metric_name + + return metric_name + + +def add_metric_name(model, metric_name, index): + """Makes the metric name unique and adds it to the model's metric name list. + + If there are multiple outputs for which the metrics are calculated, the + metric names have to be made unique by appending an integer. + + Arguments: + model: Model to which we are adding metric names. + metric_name: Metric name that corresponds to the metric specified by the + user. 
For example: 'acc' + index: The index of the model output for which the metric name is being + added. + """ + if len(model.output_names) > 1: + metric_name = '%s_%s' % (model.output_names[index], metric_name) + j = 1 + base_metric_name = metric_name + while metric_name in model.metrics_names: + metric_name = '%s_%d' % (base_metric_name, j) + j += 1 + model.metrics_names.append(metric_name) From 6a7779f3384e48012d3e27ae0f48d410f5174d06 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 19 Apr 2018 10:33:42 -0700 Subject: [PATCH 0432/1734] Fix undefined signed integer overflow by performing addition more carefully. PiperOrigin-RevId: 193537461 --- .../core/lib/random/random_distributions.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/lib/random/random_distributions.h b/tensorflow/core/lib/random/random_distributions.h index 4cf3a999f67..e963511f5cf 100644 --- a/tensorflow/core/lib/random/random_distributions.h +++ b/tensorflow/core/lib/random/random_distributions.h @@ -23,6 +23,7 @@ limitations under the License. #include #include +#include <type_traits> #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/lib/bfloat16/bfloat16.h" @@ -40,6 +41,20 @@ PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x); // Helper function to convert two 32-bit integers to a double between [0..1). PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1); +// Computes a + b. Requires that the result is representable in the destination +// type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b +// need *not* be representable in that type. (The condition on b excludes the +// extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot +// compute.) +template <typename Int> +PHILOX_DEVICE_INLINE Int SignedAdd(Int a, + typename std::make_unsigned<Int>::type b) { + // Implementation note: both b_div_2 and b - b_div_2 are positive and + // representable as Int. + auto b_div_2 = b >> 1; + return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2); +} + // A class that generates uniform distribution random numbers from the // underlying random integer generator. // Arguments: @@ -172,7 +187,7 @@ class UniformDistribution<Generator, int32> { typename Generator::ResultType sample = (*gen)(); ResultType result; for (int i = 0; i < kResultElementCount; ++i) { - result[i] = lo_ + static_cast<int32>(sample[i] % range_); + result[i] = SignedAdd(lo_, sample[i] % range_); } return result; } @@ -208,7 +223,7 @@ class UniformDistribution<Generator, int64> { ResultType result; for (int i = 0; i < kResultElementCount; ++i) { auto bits = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32; - result[i] = lo_ + static_cast<int64>(bits % range_); + result[i] = SignedAdd(lo_, bits % range_); } return result; } From 430230b4b966cade863ea5b660862734ede1cc56 Mon Sep 17 00:00:00 2001 From: imsheridan Date: Fri, 20 Apr 2018 01:37:03 +0800 Subject: [PATCH 0433/1734] Fix minor pylint issue --- tensorflow/contrib/losses/python/losses/loss_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py index 5af1f21b11d..bdad34a665e 100644 --- a/tensorflow/contrib/losses/python/losses/loss_ops.py +++ b/tensorflow/contrib/losses/python/losses/loss_ops.py @@ -652,7 +652,7 @@ def cosine_distance(predictions, ValueError: If `predictions` shape doesn't match `labels` shape, or `weights` is `None`. 
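Back to `SignedAdd` in the patch above: casting the unsigned offset to the signed type in one step can overflow when the offset exceeds the signed maximum, even though the final sum is representable; splitting the offset in half keeps every intermediate value in range. A small numpy demonstration of the same arithmetic (illustrative only; numpy int32 wraps where C++ would be undefined):

import numpy as np

lo = np.int32(-2**31)            # INT32_MIN
offset = np.uint32(2**31 + 100)  # does not fit in int32 on its own,
                                 # but lo + offset == 100 does

half = offset >> np.uint32(1)    # 2**30 + 50, fits in int32
result = lo + np.int32(half) + np.int32(offset - half)
assert result == 100             # every intermediate sum stayed in range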
""" - axis = deprecation.deprecated_argument_lookup( + axis = deprecated_argument_lookup( "axis", axis, "dim", dim) if axis is None: raise ValueError("You must specify 'axis'.") From f196351cd4e21ed6c17dcf544e0fa6cfa3030b4e Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Thu, 19 Apr 2018 10:57:55 -0700 Subject: [PATCH 0434/1734] Allow non-isolated worker sessions to borrow `WorkerEnv::device_mgr`. Without this change, a shared resource (e.g. an Iterator) could not be created in one session `s1`, and used in a later session `s2` after `s1` was closed, because the iterator might indirectly capture devices from the previous session, and use them after they are freed when the `WorkerSession` was deleted. The current change only affects the singleton "legacy" WorkerSession, which is never deleted, but this is necessary to switch all sessions to use separate WorkerSession objects. PiperOrigin-RevId: 193541426 --- tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc | 2 +- tensorflow/core/distributed_runtime/BUILD | 1 + .../base_rendezvous_mgr.cc | 4 +- .../rpc/rpc_rendezvous_mgr.cc | 2 +- .../core/distributed_runtime/session_mgr.cc | 42 ++++++++++++------- .../core/distributed_runtime/session_mgr.h | 2 +- .../distributed_runtime/session_mgr_test.cc | 23 +++++----- .../distributed_runtime/worker_session.cc | 38 ++++++++++++++++- .../core/distributed_runtime/worker_session.h | 28 +++++++++++-- 9 files changed, 106 insertions(+), 36 deletions(-) diff --git a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc index 28f68cec8cc..94f522c04e5 100644 --- a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc +++ b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc @@ -155,7 +155,7 @@ class GdrRemoteRendezvous : public BaseRemoteRendezvous { } Device* dst_device; - Status s = sess->device_mgr->LookupDevice(parsed.dst_device, &dst_device); + Status s = sess->device_mgr()->LookupDevice(parsed.dst_device, &dst_device); if (!s.ok()) { sess->worker_cache->ReleaseWorker(src_worker, rwi); done(s, Args(), recv_args, Tensor{}, false); diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD index b07cb8cdcb3..d564727da50 100644 --- a/tensorflow/core/distributed_runtime/BUILD +++ b/tensorflow/core/distributed_runtime/BUILD @@ -133,6 +133,7 @@ cc_library( "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "//tensorflow/core:ptr_util", "//tensorflow/core:worker_proto_cc", ], ) diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc index bafd9bfc68a..5f6931e0088 100644 --- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc +++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc @@ -253,13 +253,13 @@ void BaseRemoteRendezvous::SameWorkerRecvDone( WorkerSession* sess = session(); Device* src_device; - Status s = sess->device_mgr->LookupDevice(parsed.src_device, &src_device); + Status s = sess->device_mgr()->LookupDevice(parsed.src_device, &src_device); if (!s.ok()) { done(s); return; } Device* dst_device; - s = sess->device_mgr->LookupDevice(parsed.dst_device, &dst_device); + s = sess->device_mgr()->LookupDevice(parsed.dst_device, &dst_device); if (!s.ok()) { done(s); return; diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc index 067dc5dff5b..b8cb5385038 100644 --- 
a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc +++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc @@ -227,7 +227,7 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync( Device* dst_device; if (s.ok()) { - s = sess->device_mgr->LookupDevice(parsed.dst_device, &dst_device); + s = sess->device_mgr()->LookupDevice(parsed.dst_device, &dst_device); } if (!s.ok()) { if (rwi != nullptr) { diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc index e51d63cf2ba..357e9f8930f 100644 --- a/tensorflow/core/distributed_runtime/session_mgr.cc +++ b/tensorflow/core/distributed_runtime/session_mgr.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/protobuf/cluster.pb.h" #include "tensorflow/core/protobuf/tensorflow_server.pb.h" +#include "tensorflow/core/util/ptr_util.h" namespace tensorflow { @@ -33,11 +34,11 @@ SessionMgr::SessionMgr( WorkerCacheFactory worker_cache_factory) : worker_env_(worker_env), default_worker_cache_(std::move(default_worker_cache)), - legacy_session_(new WorkerSession( + legacy_session_(WorkerSession::CreateWithBorrowedDeviceMgr( "", default_worker_name, std::unique_ptr<WorkerCacheInterface>( new WorkerCacheWrapper(default_worker_cache_.get())), - std::unique_ptr<DeviceMgr>(worker_env->device_mgr), + worker_env->device_mgr, std::unique_ptr<GraphMgr>( new GraphMgr(worker_env, worker_env->device_mgr)))), worker_cache_factory_(std::move(worker_cache_factory)) {} @@ -71,19 +72,32 @@ Status SessionMgr::CreateSession(const string& session, CHECK(!worker_env_->local_devices.empty()) << "The WorkerEnv must have at least one device in `local_devices`."; - std::vector<Device*> renamed_devices; - for (Device* d : worker_env_->local_devices) { - renamed_devices.push_back(RenamedDevice::NewRenamedDevice( - worker_name, d, false, isolate_session_state)); + std::shared_ptr<WorkerSession> worker_session; + + if (isolate_session_state) { + // Create a private copy of the DeviceMgr for the WorkerSession. + std::vector<Device*> renamed_devices; + for (Device* d : worker_env_->local_devices) { + renamed_devices.push_back(RenamedDevice::NewRenamedDevice( + worker_name, d, false, isolate_session_state)); + } + + auto device_mgr = MakeUnique<DeviceMgr>(renamed_devices); + auto graph_mgr = MakeUnique<GraphMgr>(worker_env_, device_mgr.get()); + worker_session.reset( + new WorkerSession(session, worker_name, + std::unique_ptr<WorkerCacheInterface>(worker_cache), + std::move(device_mgr), std::move(graph_mgr))); + } else { + // Borrow the WorkerEnv's DeviceMgr for the WorkerSession, so + // that resources using it can use its devices after the + // WorkerSession has been deleted. 
+ auto graph_mgr = MakeUnique<GraphMgr>(worker_env_, worker_env_->device_mgr); + worker_session = WorkerSession::CreateWithBorrowedDeviceMgr( + session, worker_name, + std::unique_ptr<WorkerCacheInterface>(worker_cache), + worker_env_->device_mgr, std::move(graph_mgr)); } - std::unique_ptr<DeviceMgr> device_mgr(new DeviceMgr(renamed_devices)); - - std::unique_ptr<GraphMgr> graph_mgr( - new GraphMgr(worker_env_, device_mgr.get())); - - std::shared_ptr<WorkerSession> worker_session(new WorkerSession( - session, worker_name, std::unique_ptr<WorkerCacheInterface>(worker_cache), - std::move(device_mgr), std::move(graph_mgr))); sessions_.insert(std::make_pair(session, std::move(worker_session))); return Status::OK(); diff --git a/tensorflow/core/distributed_runtime/session_mgr.h b/tensorflow/core/distributed_runtime/session_mgr.h index 0a10fe240f2..04d1d614098 100644 --- a/tensorflow/core/distributed_runtime/session_mgr.h +++ b/tensorflow/core/distributed_runtime/session_mgr.h @@ -65,7 +65,7 @@ class SessionMgr { void ClearLogs(); private: - const WorkerEnv* const worker_env_; // Not owned. + WorkerEnv* const worker_env_; // Not owned. // A note about destruction: // We must delete graph_mgr before device_mgr, due to shared diff --git a/tensorflow/core/distributed_runtime/session_mgr_test.cc b/tensorflow/core/distributed_runtime/session_mgr_test.cc index 858e636e088..0da333833ad 100644 --- a/tensorflow/core/distributed_runtime/session_mgr_test.cc +++ b/tensorflow/core/distributed_runtime/session_mgr_test.cc @@ -43,15 +43,17 @@ class FakeDevice : public Device { class SessionMgrTest : public ::testing::Test { protected: SessionMgrTest() - : device_(FakeDevice::MakeCPU( - "/job:mnist/replica:0/task:0/device:fakecpu:0")), - mgr_(&env_, "/job:mnist/replica:0/task:0", + : mgr_(&env_, "/job:mnist/replica:0/task:0", std::unique_ptr<WorkerCacheInterface>(), factory_) { - TF_CHECK_OK(mgr_.WorkerSessionForSession("", &legacy_session_)); - env_.local_devices = {device_.get()}; + Device* device = + FakeDevice::MakeCPU("/job:mnist/replica:0/task:0/device:fakecpu:0") + .release(); + env_.local_devices = {device}; + device_mgr_.reset(new DeviceMgr(env_.local_devices)); + env_.device_mgr = device_mgr_.get(); } - std::unique_ptr<Device> device_; + std::unique_ptr<DeviceMgr> device_mgr_; WorkerEnv env_; SessionMgr::WorkerCacheFactory factory_ = [](const ServerDef& server_def, WorkerCacheInterface** worker_cache) { *worker_cache = nullptr; return Status::OK(); }; SessionMgr mgr_; - std::shared_ptr<WorkerSession> legacy_session_; }; TEST_F(SessionMgrTest, CreateSessionSimple) { @@ -84,25 +85,25 @@ TEST_F(SessionMgrTest, CreateSessionIsolateSessionState) { TF_EXPECT_OK(mgr_.CreateSession("handle_1", server_def, false)); std::shared_ptr<WorkerSession> session_1; TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_1", &session_1)); - std::vector<Device*> devices_1 = session_1->device_mgr->ListDevices(); + std::vector<Device*> devices_1 = session_1->device_mgr()->ListDevices(); EXPECT_EQ(1, devices_1.size()); TF_EXPECT_OK(mgr_.CreateSession("handle_2", server_def, false)); std::shared_ptr<WorkerSession> session_2; TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_2", &session_2)); - std::vector<Device*> devices_2 = session_2->device_mgr->ListDevices(); + std::vector<Device*> devices_2 = session_2->device_mgr()->ListDevices(); EXPECT_EQ(1, devices_2.size()); TF_EXPECT_OK(mgr_.CreateSession("handle_3", server_def, true)); std::shared_ptr<WorkerSession> session_3; TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_3", &session_3)); - std::vector<Device*> devices_3 = session_3->device_mgr->ListDevices(); + std::vector<Device*> devices_3 = session_3->device_mgr()->ListDevices(); EXPECT_EQ(1, devices_3.size()); 
TF_EXPECT_OK(mgr_.CreateSession("handle_4", server_def, true)); std::shared_ptr session_4; TF_EXPECT_OK(mgr_.WorkerSessionForSession("handle_4", &session_4)); - std::vector devices_4 = session_4->device_mgr->ListDevices(); + std::vector devices_4 = session_4->device_mgr()->ListDevices(); EXPECT_EQ(1, devices_4.size()); EXPECT_EQ(devices_1[0]->resource_manager(), devices_2[0]->resource_manager()); diff --git a/tensorflow/core/distributed_runtime/worker_session.cc b/tensorflow/core/distributed_runtime/worker_session.cc index 18886babd5f..ca6dc1b1dea 100644 --- a/tensorflow/core/distributed_runtime/worker_session.cc +++ b/tensorflow/core/distributed_runtime/worker_session.cc @@ -95,9 +95,43 @@ WorkerSession::WorkerSession(const string& session_name, : session_name(session_name), worker_name(worker_name), worker_cache(new WorkerFreeListCache(std::move(worker_cache))), - device_mgr(std::move(device_mgr)), graph_mgr(std::move(graph_mgr)), cluster_flr( - new ClusterFunctionLibraryRuntime(this, !session_name.empty())) {} + new ClusterFunctionLibraryRuntime(this, !session_name.empty())), + device_mgr_(std::move(device_mgr)), + borrowed_device_mgr_(nullptr) {} + +/* static */ +std::shared_ptr WorkerSession::CreateWithBorrowedDeviceMgr( + const string& session_name, const string& worker_name, + std::unique_ptr worker_cache, + DeviceMgr* borrowed_device_mgr, std::unique_ptr graph_mgr) { + return std::shared_ptr( + new WorkerSession(session_name, worker_name, std::move(worker_cache), + borrowed_device_mgr, std::move(graph_mgr))); +} + +WorkerSession::WorkerSession(const string& session_name, + const string& worker_name, + std::unique_ptr worker_cache, + DeviceMgr* borrowed_device_mgr, + std::unique_ptr graph_mgr) + : session_name(session_name), + worker_name(worker_name), + worker_cache(new WorkerFreeListCache(std::move(worker_cache))), + graph_mgr(std::move(graph_mgr)), + cluster_flr( + new ClusterFunctionLibraryRuntime(this, !session_name.empty())), + device_mgr_(nullptr), + borrowed_device_mgr_(borrowed_device_mgr) {} + +WorkerSession::~WorkerSession() { + if (graph_mgr) { + Status s = graph_mgr->DeregisterAll(); + if (!s.ok()) { + LOG(WARNING) << "Error during worker session deletion: " << s; + } + } +} } // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/worker_session.h b/tensorflow/core/distributed_runtime/worker_session.h index 0fd19ac27f2..f1faf493647 100644 --- a/tensorflow/core/distributed_runtime/worker_session.h +++ b/tensorflow/core/distributed_runtime/worker_session.h @@ -40,10 +40,14 @@ struct WorkerSession { // Object from which WorkerInterface instances can be obtained. const std::unique_ptr worker_cache; - // Collection of local devices. These devices are typically RenamedDevices - // in all except the SessionMgr.legacy_session_. legacy_session_.device_mgr - // == worker_env_.device_mgr, which holds the true devices. - const std::unique_ptr device_mgr; + // Collection of local devices. These devices are typically + // RenamedDevices in all except the SessionMgr.legacy_session_ and + // sessions created with `isolate_session_state == false`. In the + // those cases, this method returns a pointer to a borrowed + // DeviceMgr (typically the `worker_env.device_mgr`). + DeviceMgr* device_mgr() { + return device_mgr_ ? device_mgr_.get() : borrowed_device_mgr_; + } // graph_mgr keeps track of the registered graphs of this session. 
// @@ -57,6 +61,22 @@ struct WorkerSession { std::unique_ptr<WorkerCacheInterface> worker_cache, std::unique_ptr<DeviceMgr> device_mgr, std::unique_ptr<GraphMgr> graph_mgr); + + static std::shared_ptr<WorkerSession> CreateWithBorrowedDeviceMgr( + const string& session_name, const string& worker_name, + std::unique_ptr<WorkerCacheInterface> worker_cache, + DeviceMgr* borrowed_device_mgr, std::unique_ptr<GraphMgr> graph_mgr); + + ~WorkerSession(); + + private: + WorkerSession(const string& session_name, const string& worker_name, + std::unique_ptr<WorkerCacheInterface> worker_cache, + DeviceMgr* borrowed_device_mgr, + std::unique_ptr<GraphMgr> graph_mgr); + + const std::unique_ptr<DeviceMgr> device_mgr_; + DeviceMgr* const borrowed_device_mgr_; // Not owned. }; } // namespace tensorflow From e77bb988e470d35aca3ea1e27a4f335409f1f4d2 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 19 Apr 2018 10:59:08 -0700 Subject: [PATCH 0435/1734] Fix open source BUILD bugs for cloud profiler. Increment version for releasing cloud_tpu_profiler 1.6 with pod profiling support. PiperOrigin-RevId: 193541692 --- .../tpu/profiler/capture_tpu_profile.cc | 12 +++++----- .../pip_package/cloud_tpu_profiler/main.py | 23 +++++++++++++++++-- .../contrib/tpu/profiler/pip_package/setup.py | 2 +- tensorflow/contrib/tpu/profiler/version.h | 2 +- 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc index a5358842630..816897499b7 100644 --- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc +++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc @@ -41,7 +41,7 @@ namespace tensorflow { namespace tpu { namespace { -using ::tensorflow::grpc::TPUProfileAnalysis; +using ::tensorflow::TPUProfileAnalysis; using ::tensorflow::TPUProfiler; constexpr uint64 kMaxEvents = 1000000; @@ -137,9 +137,9 @@ bool NewSession(const string& service_addr, PopulateProfileRequest(duration_ms, repository_root, session_id, opts); new_session_request.set_repository_root(repository_root); new_session_request.set_session_id(session_id); - std::copy( - hostnames.begin(), hostnames.end(), - proto2::RepeatedFieldBackInserter(new_session_request.mutable_hosts())); + for (const auto& hostname : hostnames) { + new_session_request.add_hosts(hostname); + } ::grpc::ClientContext context; ::grpc::ChannelArguments channel_args; @@ -159,8 +159,8 @@ bool NewSession(const string& service_addr, TF_QCHECK_OK(FromGrpcStatus( stub->NewSession(&context, new_session_request, &new_session_response))); - std::cout << "Profile session succeed for hosts:" - << str_util::Join(hostnames, ","); + std::cout << "Profile session succeed for host(s):" + << str_util::Join(hostnames, ",") << std::endl; return new_session_response.empty_trace(); } diff --git a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py index 0b78cf86950..508c7a842fb 100644 --- a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py +++ b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py @@ -37,12 +37,17 @@ flags.DEFINE_string( 'will attempt to automatically detect the GCE project from metadata.') flags.DEFINE_string('tpu_name', None, 'Name of the Cloud TPU for Cluster Resolvers. You must ' - 'specify either this flag or --master.') + 'specify either this flag or --service_addr.') # Tool specific parameters flags.DEFINE_string( 'service_addr', None, 'Address of TPU profiler service e.g. 
'
    'localhost:8466, you must specify either this flag or --tpu_name.')
+flags.DEFINE_string(
+    'workers_list', None, 'The list of worker TPUs that we are about to profile'
+    ' e.g. 10.0.1.2, 10.0.1.3. You can specify this flag with --tpu_name or '
+    '--service_addr to profile a subset of TPU nodes. You can also use only '
+    '--tpu_name and leave this flag unspecified to profile all the TPUs.')
 flags.DEFINE_string('logdir', None,
                     'Path of TensorBoard log directory e.g. /tmp/tb_log, '
                     'gs://tb_bucket')
@@ -56,18 +61,25 @@ flags.DEFINE_boolean('include_dataset_ops', True,
 FLAGS = flags.FLAGS

 EXECUTABLE = 'data/capture_tpu_profile'
+JOB_NAME = 'worker'
+
+def get_workers_list(cluster_resolver):
+  cluster_spec = cluster_resolver.cluster_spec()
+  task_indices = cluster_spec.task_indices(JOB_NAME)
+  workers_list = [cluster_spec.task_address(JOB_NAME, i).split(':')[0]
+                  for i in task_indices]
+  return ','.join(workers_list)

 def run_main():
   tf.app.run(main)

-
 def main(unused_argv=None):
   tf.logging.set_verbosity(tf.logging.INFO)

   if FLAGS.service_addr is None and FLAGS.tpu_name is None:
     sys.exit('You must specify either --service_addr or --tpu_name.')

+  tpu_cluster_resolver = None
   if FLAGS.service_addr is not None:
     if FLAGS.tpu_name is not None:
       tf.logging.warn('Both --service_addr and --tpu_name are set. Ignoring '
@@ -82,6 +94,12 @@ def main(unused_argv=None):
     service_addr = tpu_cluster_resolver.get_master()
     service_addr = service_addr.replace('grpc://', '').replace(':8470', ':8466')

+  workers_list = ""
+  if FLAGS.workers_list is not None:
+    workers_list = FLAGS.workers_list
+  elif tpu_cluster_resolver is not None:
+    workers_list = get_workers_list(tpu_cluster_resolver)
+
   if not FLAGS.logdir:
     sys.exit('logdir must be provided.')
   executable_path = os.path.join(os.path.dirname(__file__), EXECUTABLE)
@@ -89,6 +107,7 @@ def main(unused_argv=None):
   cmd = [executable_path]
   cmd.append('--logdir=' + logdir)
   cmd.append('--service_addr=' + service_addr)
+  cmd.append('--workers_list=' + workers_list)
   cmd.append('--duration_ms=' + str(FLAGS.duration_ms))
   cmd.append('--num_tracing_attempts=' + str(FLAGS.num_tracing_attempts))
   cmd.append('--include_dataset_ops=' + str(FLAGS.include_dataset_ops).lower())
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/setup.py b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
index 8d99835b641..ebd478fd022 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/setup.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
@@ -20,7 +20,7 @@ from __future__ import print_function

 from setuptools import setup

-_VERSION = '1.6.0-rc1'
+_VERSION = '1.6.0'

 CONSOLE_SCRIPTS = [
     'capture_tpu_profile=cloud_tpu_profiler.main:run_main',
diff --git a/tensorflow/contrib/tpu/profiler/version.h b/tensorflow/contrib/tpu/profiler/version.h
index dc6a9348911..618479e1a6c 100644
--- a/tensorflow/contrib/tpu/profiler/version.h
+++ b/tensorflow/contrib/tpu/profiler/version.h
@@ -16,6 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
 #define TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_

-#define TPU_PROFILER_VERSION "1.5.0"
+#define TPU_PROFILER_VERSION "1.6.0"

 #endif  // TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_

From 62c3b7dece92a3ad1a39e7c4eb0894411e435258 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 19 Apr 2018 11:08:08 -0700
Subject: [PATCH 0436/1734] Updating tests in constant_folding_test.cc so that
 they evaluate both the original and optimized graphs and check that their
 outputs are the same.
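The pattern these tests now follow: run the optimizer, then evaluate both
graphs with identical feeds and compare the resulting tensors. A minimal
sketch of that pattern (helper names as used in the test file; feeds
hypothetical):

  ConstantFolding optimizer(nullptr /* cpu_device */);
  GraphDef output;
  TF_CHECK_OK(optimizer.Optimize(nullptr, item, &output));
  auto expected = EvaluateNodes(item.graph, item.fetch, feeds);
  auto actual = EvaluateNodes(output, item.fetch, feeds);
  ASSERT_EQ(expected.size(), actual.size());
  for (size_t i = 0; i < expected.size(); ++i) {
    test::ExpectTensorNear<float>(expected[i], actual[i], 1e-5);
  }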
PiperOrigin-RevId: 193543478 --- .../optimizers/constant_folding_test.cc | 52 +++++++++++++++++-- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc index 36625b68b77..1acce05909c 100644 --- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc +++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc @@ -689,8 +689,7 @@ TEST_F(ConstantFoldingTest, ControlDependencies) { GrapplerItem item; item.fetch.push_back("e"); TF_CHECK_OK(scope.ToGraphDef(&item.graph)); - auto tensors_expected = EvaluateNodes(item.graph, item.fetch); - EXPECT_EQ(1, tensors_expected.size()); + ConstantFolding optimizer(nullptr /* cpu_device */); GraphDef output; Status status = optimizer.Optimize(nullptr, item, &output); @@ -717,9 +716,6 @@ TEST_F(ConstantFoldingTest, ControlDependencies) { } } EXPECT_EQ(1, found); - auto tensors = EvaluateNodes(output, item.fetch); - EXPECT_EQ(1, tensors.size()); - test::ExpectTensorEqual(tensors_expected[0], tensors[0]); } TEST_F(ConstantFoldingTest, ControlDependenciesEmptyFetch) { @@ -995,6 +991,18 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationEmptyFetch) { } } EXPECT_EQ(3, found); + + auto v1_t = GenerateRandomTensor(TensorShape({3})); + auto v2_t = GenerateRandomTensor(TensorShape({5, 7})); + auto v3_t = GenerateRandomTensor(TensorShape({11, 13})); + std::vector fetch_nodes = {"p2"}; + auto tensors_expected = EvaluateNodes( + item.graph, fetch_nodes, {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}}); + EXPECT_EQ(1, tensors_expected.size()); + auto tensors = EvaluateNodes(output, fetch_nodes, + {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}}); + EXPECT_EQ(1, tensors.size()); + test::ExpectTensorEqual(tensors_expected[0], tensors[0]); } TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN) { @@ -1192,6 +1200,30 @@ TEST_F(ConstantFoldingTest, SwitchNodesEmptyFetch) { } } EXPECT_EQ(4, found); + + auto v_in_t = GenerateRandomTensor(TensorShape({3})); + Tensor v_ctrl_t(DT_BOOL, TensorShape({})); + + v_ctrl_t.flat()(0) = true; + std::vector fetch_nodes = {"m", "m2"}; + auto tensors_expected = EvaluateNodes( + item.graph, fetch_nodes, {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}}); + EXPECT_EQ(2, tensors_expected.size()); + auto tensors = EvaluateNodes(output, fetch_nodes, + {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}}); + EXPECT_EQ(2, tensors.size()); + test::ExpectTensorEqual(tensors_expected[0], tensors[0]); + test::ExpectTensorNear(tensors_expected[1], tensors[1], 1e-5); + + v_ctrl_t.flat()(0) = false; + tensors_expected = EvaluateNodes(item.graph, fetch_nodes, + {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}}); + EXPECT_EQ(2, tensors_expected.size()); + tensors = EvaluateNodes(output, fetch_nodes, + {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}}); + EXPECT_EQ(2, tensors.size()); + test::ExpectTensorEqual(tensors_expected[0], tensors[0]); + test::ExpectTensorNear(tensors_expected[1], tensors[1], 1e-5); } TEST_F(ConstantFoldingTest, SwitchNodes) { @@ -1268,6 +1300,16 @@ TEST_F(ConstantFoldingTest, SwitchNodes) { EXPECT_EQ(2, tensors.size()); test::ExpectTensorEqual(tensors_expected[0], tensors[0]); test::ExpectTensorNear(tensors_expected[1], tensors[1], 1e-5); + + v_ctrl_t.flat()(0) = false; + tensors_expected = EvaluateNodes(item.graph, item.fetch, + {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}}); + EXPECT_EQ(2, tensors_expected.size()); + tensors = EvaluateNodes(output, item.fetch, + {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}}); + EXPECT_EQ(2, tensors.size()); + 
test::ExpectTensorEqual(tensors_expected[0], tensors[0]); + test::ExpectTensorNear(tensors_expected[1], tensors[1], 1e-5); } TEST_F(ConstantFoldingTest, MergeNodes) { From 9b496c9134529f6d85f0e9757099104cf506cbd6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 19 Apr 2018 11:21:21 -0700 Subject: [PATCH 0437/1734] Update ops-related pbtxt files. PiperOrigin-RevId: 193546050 --- tensorflow/core/ops/compat/ops_history.v1.pbtxt | 15 +++++++++++++++ tensorflow/core/ops/ops.pbtxt | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt index 9bc11cf0fe2..dbd6f859c46 100644 --- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt @@ -15829,6 +15829,21 @@ op { minimum: 1 } } +op { + name: "DatasetToTFRecord" + input_arg { + name: "input_dataset" + type: DT_VARIANT + } + input_arg { + name: "filename" + type: DT_STRING + } + input_arg { + name: "compression_type" + type: DT_STRING + } +} op { name: "DebugGradientIdentity" input_arg { diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 9b665190ce0..46afe357f06 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -7051,6 +7051,21 @@ op { minimum: 1 } } +op { + name: "DatasetToTFRecord" + input_arg { + name: "input_dataset" + type: DT_VARIANT + } + input_arg { + name: "filename" + type: DT_STRING + } + input_arg { + name: "compression_type" + type: DT_STRING + } +} op { name: "DebugGradientIdentity" input_arg { From 87229e4fc3bc23c7a92bfdf40e5834ac65a00d34 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 19 Apr 2018 11:47:28 -0700 Subject: [PATCH 0438/1734] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 193550428 --- tensorflow/go/op/wrappers.go | 72 ++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 35ad1eff0fc..3b3dff0573a 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -3105,6 +3105,42 @@ func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) return op.Output(0) } +// Return a tensor with the same shape and contents as the input tensor or value. +func Identity(scope *Scope, input tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Identity", + Input: []tf.Input{ + input, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Computes arctangent of `y/x` element-wise, respecting signs of the arguments. +// +// This is the angle \( \theta \in [-\pi, \pi] \) such that +// \[ x = r \cos(\theta) \] +// and +// \[ y = r \sin(\theta) \] +// where \(r = \sqrt(x^2 + y^2) \). +func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Atan2", + Input: []tf.Input{ + y, x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // Creates a dataset that passes a sliding window over `input_dataset`. // // Arguments: @@ -25383,42 +25419,6 @@ func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional .. return op.Output(0) } -// Computes arctangent of `y/x` element-wise, respecting signs of the arguments. 
-// -// This is the angle \( \theta \in [-\pi, \pi] \) such that -// \[ x = r \cos(\theta) \] -// and -// \[ y = r \sin(\theta) \] -// where \(r = \sqrt(x^2 + y^2) \). -func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Atan2", - Input: []tf.Input{ - y, x, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Return a tensor with the same shape and contents as the input tensor or value. -func Identity(scope *Scope, input tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Identity", - Input: []tf.Input{ - input, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // Gather slices from `params` axis `axis` according to `indices`. // // `indices` must be an integer tensor of any dimension (usually 0-D or 1-D). From 78db5136edf30667090988c703f98f4f8c4c4269 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 19 Apr 2018 11:52:10 -0700 Subject: [PATCH 0439/1734] Implements linear_model using _LinearModel. Added support for cols_to_vars in _LinearModel in order to make this possible. Also, made some fixes so that variable names come out the same as before. PiperOrigin-RevId: 193551353 --- .../python/feature_column/feature_column.py | 106 ++++++++-------- .../feature_column/feature_column_test.py | 117 ++++++++++++------ .../training/warm_starting_util_test.py | 16 +-- 3 files changed, 138 insertions(+), 101 deletions(-) diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py index 0ad8131599a..87a52f84415 100644 --- a/tensorflow/python/feature_column/feature_column.py +++ b/tensorflow/python/feature_column/feature_column.py @@ -409,58 +409,19 @@ def linear_model(features, ValueError: if an item in `feature_columns` is neither a `_DenseColumn` nor `_CategoricalColumn`. """ - feature_columns = _clean_feature_columns(feature_columns) - for column in feature_columns: - if not isinstance(column, (_DenseColumn, _CategoricalColumn)): - raise ValueError('Items of feature_columns must be either a _DenseColumn ' - 'or _CategoricalColumn. Given: {}'.format(column)) - weight_collections = list(weight_collections or []) - if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections: - weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES) - if ops.GraphKeys.MODEL_VARIABLES not in weight_collections: - weight_collections.append(ops.GraphKeys.MODEL_VARIABLES) - with variable_scope.variable_scope( - None, default_name='linear_model', values=features.values()): - weighted_sums = [] - ordered_columns = [] - builder = _LazyBuilder(features) - for column in sorted(feature_columns, key=lambda x: x.name): - with variable_scope.variable_scope( - None, default_name=column._var_scope_name): # pylint: disable=protected-access - ordered_columns.append(column) - weighted_sum = _create_weighted_sum( - column=column, - builder=builder, - units=units, - sparse_combiner=sparse_combiner, - weight_collections=weight_collections, - trainable=trainable) - weighted_sums.append(weighted_sum) - if cols_to_vars is not None: - # Retrieve the variables created. 
- cols_to_vars[column] = ops.get_collection( - ops.GraphKeys.GLOBAL_VARIABLES, - scope=variable_scope.get_variable_scope().name) - _verify_static_batch_size_equality(weighted_sums, ordered_columns) - predictions_no_bias = math_ops.add_n( - weighted_sums, name='weighted_sum_no_bias') - bias = variable_scope.get_variable( - 'bias_weights', - shape=[units], - initializer=init_ops.zeros_initializer(), - trainable=trainable, - collections=weight_collections) - predictions = nn_ops.bias_add( - predictions_no_bias, bias, name='weighted_sum') - if cols_to_vars is not None: - # Add the bias to cols_to_vars as well, converting the Variable or - # PartitionedVariable to a list of Variable's. - if (isinstance(bias, variables.Variable) or - resource_variable_ops.is_resource_variable(bias)): - cols_to_vars['bias'] = [bias] - else: # Must be a PartitionedVariable. - cols_to_vars['bias'] = list(bias) - return predictions + linear_model_layer = _LinearModel( + feature_columns=feature_columns, + units=units, + sparse_combiner=sparse_combiner, + weight_collections=weight_collections, + trainable=trainable, + name='linear_model') + retval = linear_model_layer(features) # pylint: disable=not-callable + if cols_to_vars is None: + return retval + for k, v in linear_model_layer.cols_to_vars().items(): + cols_to_vars[k] = v + return retval def _add_to_collections(var, weight_collections): @@ -551,8 +512,22 @@ class _BiasLayer(base.Layer): return self._bias_variable +def _get_expanded_variable_list(variable): + if (isinstance(variable, variables.Variable) or + resource_variable_ops.is_resource_variable(variable)): + return [variable] # Single variable case. + else: # Must be a PartitionedVariable, so convert into a list. + return list(variable) + + +def _strip_leading_slashes(name): + return name.rsplit('/', 1)[-1] + + class _LinearModel(training.Model): """Creates a linear model using feature columns. + + See `linear_model` for details. """ def __init__(self, @@ -573,7 +548,10 @@ class _LinearModel(training.Model): for column in sorted(self._feature_columns, key=lambda x: x.name): with variable_scope.variable_scope( None, default_name=column._var_scope_name) as vs: # pylint: disable=protected-access - column_name = vs.name + # Having the fully expressed variable scope name ends up doubly + # expressing the outer scope (scope with which this method was called) + # in the name of the variable that would get created. + column_name = _strip_leading_slashes(vs.name) column_layer = _FCLinearWrapper(column, units, sparse_combiner, self._weight_collections, trainable, column_name, **kwargs) @@ -585,6 +563,15 @@ class _LinearModel(training.Model): weight_collections=self._weight_collections, name='bias_layer', **kwargs) + self._cols_to_vars = {} + + def cols_to_vars(self): + """Returns a dict mapping _FeatureColumns to variables. + + See `linear_model` for more information. + This is not populated till `call` is called i.e. layer is built. 
+ """ + return self._cols_to_vars def call(self, features): with variable_scope.variable_scope(self.name): @@ -597,15 +584,24 @@ class _LinearModel(training.Model): ordered_columns = [] builder = _LazyBuilder(features) for layer in sorted(self._column_layers.values(), key=lambda x: x.name): - ordered_columns.append(layer._feature_column) # pylint: disable=protected-access + column = layer._feature_column # pylint: disable=protected-access + ordered_columns.append(column) weighted_sum = layer(builder) weighted_sums.append(weighted_sum) + self._cols_to_vars[column] = ops.get_collection( + ops.GraphKeys.GLOBAL_VARIABLES, scope=layer.scope_name) _verify_static_batch_size_equality(weighted_sums, ordered_columns) predictions_no_bias = math_ops.add_n( weighted_sums, name='weighted_sum_no_bias') predictions = nn_ops.bias_add( - predictions_no_bias, self._bias_layer(builder), name='weighted_sum') # pylint: disable=not-callable + predictions_no_bias, + self._bias_layer( # pylint: disable=not-callable + builder, + scope=variable_scope.get_variable_scope()), # pylint: disable=not-callable + name='weighted_sum') + bias = self._bias_layer.variables[0] + self._cols_to_vars['bias'] = _get_expanded_variable_list(bias) return predictions def _add_layers(self, layers): diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py index 46404abadca..49e06b82453 100644 --- a/tensorflow/python/feature_column/feature_column_test.py +++ b/tensorflow/python/feature_column/feature_column_test.py @@ -345,7 +345,7 @@ class NumericColumnTest(test.TestCase): with ops.Graph().as_default(): features = {'price': [[1.], [5.]]} predictions = get_keras_linear_model_predictions(features, [price]) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() price_var = get_linear_model_column_var(price) with _initialized_session() as sess: self.assertAllClose([0.], bias.eval()) @@ -584,7 +584,7 @@ class BucketizedColumnTest(test.TestCase): features = {'price': [[-1.], [1.], [5.], [6.]]} predictions = get_keras_linear_model_predictions(features, [bucketized_price]) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() bucketized_price_var = get_linear_model_column_var(bucketized_price) with _initialized_session() as sess: self.assertAllClose([0.], bias.eval()) @@ -610,7 +610,7 @@ class BucketizedColumnTest(test.TestCase): features = {'price': [[-1., 1.], [5., 6.]]} predictions = get_keras_linear_model_predictions(features, [bucketized_price]) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() bucketized_price_var = get_linear_model_column_var(bucketized_price) with _initialized_session() as sess: self.assertAllClose([0.], bias.eval()) @@ -849,7 +849,7 @@ class HashedCategoricalColumnTest(test.TestCase): values=('marlo', 'skywalker', 'omar'), dense_shape=(2, 2)) }, (wire_column,)) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() wire_var = get_linear_model_column_var(wire_column) with _initialized_session(): self.assertAllClose((0.,), bias.eval()) @@ -1171,7 +1171,7 @@ class CrossedColumnTest(test.TestCase): values=['cA', 'cB', 'cC'], dense_shape=(2, 2)), }, (crossed,)) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() crossed_var = get_linear_model_column_var(crossed) with _initialized_session() as sess: self.assertAllClose((0.,), bias.eval()) @@ -1254,18 +1254,13 @@ def get_linear_model_column_var(column): 'linear_model/' + column.name)[0] -def 
get_keras_linear_model_bias(): - with variable_scope.variable_scope('linear_model', reuse=True): - with variable_scope.variable_scope('bias_layer', reuse=True): - return variable_scope.get_variable('bias_weights') - - def get_keras_linear_model_predictions(features, feature_columns, units=1, sparse_combiner='sum', weight_collections=None, - trainable=True): + trainable=True, + cols_to_vars=None): keras_linear_model = _LinearModel( feature_columns, units, @@ -1273,7 +1268,12 @@ def get_keras_linear_model_predictions(features, weight_collections, trainable, name='linear_model') - return keras_linear_model(features) # pylint: disable=not-callable + retval = keras_linear_model(features) # pylint: disable=not-callable + if cols_to_vars is None: + return retval + for k, v in keras_linear_model.cols_to_vars().items(): + cols_to_vars[k] = v + return retval @test_util.with_c_api @@ -1977,7 +1977,7 @@ class _LinearModelTest(test.TestCase): with ops.Graph().as_default(): features = {'price': [[1.], [5.]]} predictions = get_keras_linear_model_predictions(features, [price]) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() price_var = get_linear_model_column_var(price) with _initialized_session() as sess: self.assertAllClose([0.], bias.eval()) @@ -1994,7 +1994,7 @@ class _LinearModelTest(test.TestCase): dense_shape=[2, 2]) features = {'wire_cast': wire_tensor} predictions = get_keras_linear_model_predictions(features, [wire_cast]) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() wire_cast_var = get_linear_model_column_var(wire_cast) with _initialized_session() as sess: self.assertAllClose([0.], bias.eval()) @@ -2014,7 +2014,7 @@ class _LinearModelTest(test.TestCase): features = {'wire_cast': wire_tensor, 'price': [[1.], [5.]]} predictions = get_keras_linear_model_predictions(features, [wire_cast, price]) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() wire_cast_var = get_linear_model_column_var(wire_cast) price_var = get_linear_model_column_var(price) with _initialized_session() as sess: @@ -2072,7 +2072,7 @@ class _LinearModelTest(test.TestCase): features = {dense_and_sparse_column.name: sp_tensor} predictions = get_keras_linear_model_predictions( features, [dense_and_sparse_column]) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() dense_and_sparse_column_var = get_linear_model_column_var( dense_and_sparse_column) with _initialized_session() as sess: @@ -2088,7 +2088,7 @@ class _LinearModelTest(test.TestCase): features = {'price': [[1.], [5.]]} predictions = get_keras_linear_model_predictions( features, [price], units=3) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() price_var = get_linear_model_column_var(price) with _initialized_session() as sess: self.assertAllClose(np.zeros((3,)), bias.eval()) @@ -2108,7 +2108,7 @@ class _LinearModelTest(test.TestCase): features = {'wire_cast': wire_tensor} predictions = get_keras_linear_model_predictions( features, [wire_cast], units=3) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() wire_cast_var = get_linear_model_column_var(wire_cast) with _initialized_session() as sess: self.assertAllClose(np.zeros((3,)), bias.eval()) @@ -2163,7 +2163,7 @@ class _LinearModelTest(test.TestCase): features = {'wire_cast': wire_tensor} predictions = get_keras_linear_model_predictions( features, [wire_cast], sparse_combiner='mean') - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() wire_cast_var = 
get_linear_model_column_var(wire_cast) with _initialized_session() as sess: sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]])) @@ -2176,7 +2176,7 @@ class _LinearModelTest(test.TestCase): features = {'price': [[1., 2.], [5., 6.]]} predictions = get_keras_linear_model_predictions( features, [price], units=3) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() price_var = get_linear_model_column_var(price) with _initialized_session() as sess: self.assertAllClose(np.zeros((3,)), bias.eval()) @@ -2206,7 +2206,7 @@ class _LinearModelTest(test.TestCase): with ops.Graph().as_default(): features = {'price': [[[1., 2.]], [[5., 6.]]]} predictions = get_keras_linear_model_predictions(features, [price]) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() price_var = get_linear_model_column_var(price) with _initialized_session() as sess: self.assertAllClose([0.], bias.eval()) @@ -2222,7 +2222,7 @@ class _LinearModelTest(test.TestCase): features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]} predictions = get_keras_linear_model_predictions(features, [price1, price2]) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() price1_var = get_linear_model_column_var(price1) price2_var = get_linear_model_column_var(price2) with _initialized_session() as sess: @@ -2235,6 +2235,45 @@ class _LinearModelTest(test.TestCase): sess.run(bias.assign([7.])) self.assertAllClose([[3217.], [4657.]], predictions.eval()) + def test_fills_cols_to_vars(self): + price1 = fc.numeric_column('price1', shape=2) + price2 = fc.numeric_column('price2') + with ops.Graph().as_default(): + features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]} + cols_to_vars = {} + get_keras_linear_model_predictions( + features, [price1, price2], cols_to_vars=cols_to_vars) + bias = get_linear_model_bias() + price1_var = get_linear_model_column_var(price1) + price2_var = get_linear_model_column_var(price2) + self.assertAllEqual(cols_to_vars['bias'], [bias]) + self.assertAllEqual(cols_to_vars[price1], [price1_var]) + self.assertAllEqual(cols_to_vars[price2], [price2_var]) + + def test_fills_cols_to_vars_partitioned_variables(self): + price1 = fc.numeric_column('price1', shape=2) + price2 = fc.numeric_column('price2', shape=3) + with ops.Graph().as_default(): + features = { + 'price1': [[1., 2.], [6., 7.]], + 'price2': [[3., 4., 5.], [8., 9., 10.]] + } + cols_to_vars = {} + with variable_scope.variable_scope( + 'linear', + partitioner=partitioned_variables.fixed_size_partitioner(2, axis=0)): + get_keras_linear_model_predictions( + features, [price1, price2], cols_to_vars=cols_to_vars) + with _initialized_session(): + self.assertEqual([0.], cols_to_vars['bias'][0].eval()) + # Partitioning shards the [2, 1] price1 var into 2 [1, 1] Variables. + self.assertAllEqual([[0.]], cols_to_vars[price1][0].eval()) + self.assertAllEqual([[0.]], cols_to_vars[price1][1].eval()) + # Partitioning shards the [3, 1] price2 var into a [2, 1] Variable and + # a [1, 1] Variable. 
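      # A minimal usage sketch of the cols_to_vars plumbing exercised by
      # these tests (feature values hypothetical):
      #
      #   price = fc.numeric_column('price')
      #   cols_to_vars = {}
      #   predictions = fc.linear_model(
      #       {'price': [[1.], [5.]]}, [price], cols_to_vars=cols_to_vars)
      #   # cols_to_vars[price] -> list of weight Variables for the column
      #   # (several shards when a partitioner is in effect), and
      #   # cols_to_vars['bias'] -> [the shared bias Variable].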
+ self.assertAllEqual([[0.], [0.]], cols_to_vars[price2][0].eval()) + self.assertAllEqual([[0.]], cols_to_vars[price2][1].eval()) + def test_dense_collection(self): price = fc.numeric_column('price') with ops.Graph().as_default() as g: @@ -2242,7 +2281,7 @@ class _LinearModelTest(test.TestCase): get_keras_linear_model_predictions( features, [price], weight_collections=['my-vars']) my_vars = g.get_collection('my-vars') - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() price_var = get_linear_model_column_var(price) self.assertIn(bias, my_vars) self.assertIn(price_var, my_vars) @@ -2256,7 +2295,7 @@ class _LinearModelTest(test.TestCase): get_keras_linear_model_predictions( features, [wire_cast], weight_collections=['my-vars']) my_vars = g.get_collection('my-vars') - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() wire_cast_var = get_linear_model_column_var(wire_cast) self.assertIn(bias, my_vars) self.assertIn(wire_cast_var, my_vars) @@ -2266,7 +2305,7 @@ class _LinearModelTest(test.TestCase): with ops.Graph().as_default() as g: features = {'price': [[1.], [5.]]} get_keras_linear_model_predictions(features, [price]) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() price_var = get_linear_model_column_var(price) trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) self.assertIn(bias, trainable_vars) @@ -2280,7 +2319,7 @@ class _LinearModelTest(test.TestCase): features = {'wire_cast': wire_tensor} get_keras_linear_model_predictions(features, [wire_cast]) trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() wire_cast_var = get_linear_model_column_var(wire_cast) self.assertIn(bias, trainable_vars) self.assertIn(wire_cast_var, trainable_vars) @@ -2427,7 +2466,7 @@ class _LinearModelTest(test.TestCase): coord = coordinator.Coordinator() threads = queue_runner_impl.start_queue_runners(sess, coord=coord) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() price_buckets_var = get_linear_model_column_var(price_buckets) body_style_var = get_linear_model_column_var(body_style) @@ -2470,7 +2509,7 @@ class _LinearModelTest(test.TestCase): net = get_keras_linear_model_predictions(features, [price_buckets, body_style]) with _initialized_session() as sess: - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() price_buckets_var = get_linear_model_column_var(price_buckets) body_style_var = get_linear_model_column_var(body_style) @@ -2509,7 +2548,7 @@ class _LinearModelTest(test.TestCase): net = get_keras_linear_model_predictions( features, [price_buckets, body_style, country]) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() price_buckets_var = get_linear_model_column_var(price_buckets) body_style_var = get_linear_model_column_var(body_style) with _initialized_session() as sess: @@ -3688,7 +3727,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase): values=('marlo', 'skywalker', 'omar'), dense_shape=(2, 2)) }, (wire_column,)) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() wire_var = get_linear_model_column_var(wire_column) with _initialized_session(): self.assertAllClose((0.,), bias.eval()) @@ -4080,7 +4119,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase): values=('marlo', 'skywalker', 'omar'), dense_shape=(2, 2)) }, (wire_column,)) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() wire_var = 
get_linear_model_column_var(wire_column) with _initialized_session(): self.assertAllClose((0.,), bias.eval()) @@ -4326,7 +4365,7 @@ class IdentityCategoricalColumnTest(test.TestCase): values=(0, 2, 1), dense_shape=(2, 2)) }, (column,)) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() weight_var = get_linear_model_column_var(column) with _initialized_session(): self.assertAllClose((0.,), bias.eval()) @@ -5108,7 +5147,7 @@ class EmbeddingColumnTest(test.TestCase): categorical_column.name: sparse_input }, (embedding_column,)) expected_var_names = ( - 'linear_model/bias_layer/bias_weights:0', + 'linear_model/bias_weights:0', 'linear_model/aaa_embedding/weights:0', 'linear_model/aaa_embedding/embedding_weights:0', ) @@ -5120,7 +5159,7 @@ class EmbeddingColumnTest(test.TestCase): for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) } self.assertItemsEqual(expected_var_names, trainable_vars.keys()) - bias = trainable_vars['linear_model/bias_layer/bias_weights:0'] + bias = trainable_vars['linear_model/bias_weights:0'] embedding_weights = trainable_vars[ 'linear_model/aaa_embedding/embedding_weights:0'] linear_weights = trainable_vars['linear_model/aaa_embedding/weights:0'] @@ -5757,7 +5796,7 @@ class SharedEmbeddingColumnTest(test.TestCase): # Linear weights do not follow the column name. But this is a rare use # case, and fixing it would add too much complexity to the code. expected_var_names = ( - 'linear_model/bias_layer/bias_weights:0', + 'linear_model/bias_weights:0', 'linear_model/aaa_bbb_shared_embedding/weights:0', 'linear_model/aaa_bbb_shared_embedding/embedding_weights:0', 'linear_model/aaa_bbb_shared_embedding_1/weights:0', @@ -5770,7 +5809,7 @@ class SharedEmbeddingColumnTest(test.TestCase): for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) } self.assertItemsEqual(expected_var_names, trainable_vars.keys()) - bias = trainable_vars['linear_model/bias_layer/bias_weights:0'] + bias = trainable_vars['linear_model/bias_weights:0'] embedding_weights = trainable_vars[ 'linear_model/aaa_bbb_shared_embedding/embedding_weights:0'] linear_weights_a = trainable_vars[ @@ -6105,7 +6144,7 @@ class WeightedCategoricalColumnTest(test.TestCase): values=(.5, 1., .1), dense_shape=(2, 2)) }, (column,)) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() weight_var = get_linear_model_column_var(column) with _initialized_session(): self.assertAllClose((0.,), bias.eval()) @@ -6172,7 +6211,7 @@ class WeightedCategoricalColumnTest(test.TestCase): dense_shape=(2, 2)), 'values': ((.5,), (1.,), (.1,)) }, (column,)) - bias = get_keras_linear_model_bias() + bias = get_linear_model_bias() weight_var = get_linear_model_column_var(column) with _initialized_session(): self.assertAllClose((0.,), bias.eval()) diff --git a/tensorflow/python/training/warm_starting_util_test.py b/tensorflow/python/training/warm_starting_util_test.py index 6e445d8bd14..7e8cbd6baee 100644 --- a/tensorflow/python/training/warm_starting_util_test.py +++ b/tensorflow/python/training/warm_starting_util_test.py @@ -946,18 +946,20 @@ class WarmStartingUtilTest(test.TestCase): # emb_vocab should be correctly warm-started after vocab remapping. # Missing values are filled in with the EmbeddingColumn's initializer. self._assert_cols_to_vars( - cols_to_vars, { + cols_to_vars, + { emb_vocab: [ - # embedding_weights part 0. - np.array([[3., 3.3], [2., 2.2], [1., 1.1]]), - # embedding_weights part 1. - np.array([[0.5, 0.4], [0.42, 0.42], [0.42, 0.42]]), # linear weights part 0. 
np.array([[0.69]]), # linear weights part 1. - np.array([[0.71]]) + np.array([[0.71]]), + # embedding_weights part 0. + np.array([[3., 3.3], [2., 2.2], [1., 1.1]]), + # embedding_weights part 1. + np.array([[0.5, 0.4], [0.42, 0.42], [0.42, 0.42]]) ] - }, sess) + }, + sess) def testErrorConditions(self): x = variable_scope.get_variable( From 173aadc6b62dd95691257c2d9f158dd9044bb4ef Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 19 Apr 2018 11:55:46 -0700 Subject: [PATCH 0440/1734] Change estimator to only log non-binary eval metrics, because logging binary metrics such as images will lead to crash. PiperOrigin-RevId: 193551927 --- tensorflow/python/estimator/estimator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py index a42b6cfee85..9862fdecdb2 100644 --- a/tensorflow/python/estimator/estimator.py +++ b/tensorflow/python/estimator/estimator.py @@ -1256,7 +1256,8 @@ def _dict_to_str(dictionary): A `str` representing the `dictionary`. """ return ', '.join('%s = %s' % (k, v) - for k, v in sorted(six.iteritems(dictionary))) + for k, v in sorted(six.iteritems(dictionary)) + if not isinstance(v, six.binary_type)) def _write_dict_to_summary(output_dir, From fb02b02689b0e126c93cbcb8462e8417e1d954cc Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 19 Apr 2018 11:57:36 -0700 Subject: [PATCH 0441/1734] Avoid looking up the shape functions multiple times Improved the handling of fed nodes PiperOrigin-RevId: 193552210 --- .../core/grappler/costs/graph_properties.cc | 155 +++++++++--------- .../core/grappler/costs/graph_properties.h | 7 - 2 files changed, 78 insertions(+), 84 deletions(-) diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc index c83ddfe90a0..dd2d53dfdfb 100644 --- a/tensorflow/core/grappler/costs/graph_properties.cc +++ b/tensorflow/core/grappler/costs/graph_properties.cc @@ -395,8 +395,11 @@ class TopoQueue { // unknown shape/dimension of a given node. 
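// The "avoid repeated lookups" half of this change, sketched: each node's
// OpRegistrationData is resolved once, when its NodeContext is created, and
// every later refinement pass reuses the cached pointer instead of calling
// LookUp() again (abridged from the hunks below):
//
//   struct NodeContext {
//     const OpRegistrationData* op_data;
//     std::unique_ptr<InferenceContext> inference_context;
//   };
//
//   TF_RETURN_IF_ERROR(
//       function_library_.LookUp(node->type_string(), &node_ctx.op_data));
//   ...
//   if (!c->op_data || c->op_data->shape_inference_fn == nullptr)
//     return c->inference_context->Run(shape_inference::UnknownShape);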
class SymbolicShapeRefiner { public: - explicit SymbolicShapeRefiner(const GraphDef& graph) - : function_library_(OpRegistry::Global(), graph.library()) { + explicit SymbolicShapeRefiner( + const GraphDef& graph, + const std::unordered_map>& fed_ports) + : function_library_(OpRegistry::Global(), graph.library()), + fed_ports_(fed_ports) { graph_def_version_ = graph.versions().producer(); node_to_context_.reserve(graph.node_size()); } @@ -704,6 +707,9 @@ class SymbolicShapeRefiner { std::vector input_tensors_as_shapes; NodeContext& node_ctx = node_to_context_[node]; + TF_RETURN_IF_ERROR( + function_library_.LookUp(node->type_string(), &node_ctx.op_data)); + node_ctx.inference_context.reset(new InferenceContext( graph_def_version_, &node->def(), node->op_def(), input_shapes, input_tensors, input_tensors_as_shapes, @@ -716,6 +722,7 @@ class SymbolicShapeRefiner { } struct NodeContext { + const OpRegistrationData* op_data; std::unique_ptr inference_context; std::vector output_tensors_as_shapes; }; @@ -723,65 +730,80 @@ class SymbolicShapeRefiner { Status InferShapes(const Node* node, NodeContext* c) { InferenceContext* ic = c->inference_context.get(); - // Propagate shape tensors - if (node->type_string() == "Shape") { - c->output_tensors_as_shapes.resize(1); - c->output_tensors_as_shapes[0] = c->inference_context->input(0); - } else if (node->type_string() == "ShapeN") { - c->output_tensors_as_shapes.resize(c->inference_context->num_inputs()); - for (int i = 0; i < c->inference_context->num_inputs(); ++i) { - c->output_tensors_as_shapes[i] = c->inference_context->input(i); - } - } else if (node->type_string() == "ConcatV2") { - bool valid = true; - ShapeHandle result; - for (int i = 0; i < ic->num_inputs() - 1; ++i) { - ShapeHandle input = ic->input_tensors_as_shapes()[i]; - if (!ic->RankKnown(input)) { - valid = false; - break; - } else if (i == 0) { - result = input; - } else { - TF_RETURN_IF_ERROR(ic->Concatenate(result, input, &result)); + auto it = fed_ports_.find(node->name()); + const bool is_fed = it != fed_ports_.end(); + + // Propagate shape tensors unless the node is fed. + // TODO(bsteiner) We should still propagate the shapes to the ports that + // aren't fed in the case of a ShapeN node. + if (!is_fed) { + if (node->type_string() == "Shape") { + c->output_tensors_as_shapes.resize(1); + c->output_tensors_as_shapes[0] = c->inference_context->input(0); + } else if (node->type_string() == "ShapeN") { + c->output_tensors_as_shapes.resize(c->inference_context->num_inputs()); + for (int i = 0; i < c->inference_context->num_inputs(); ++i) { + c->output_tensors_as_shapes[i] = c->inference_context->input(i); } - } - if (valid) { - c->output_tensors_as_shapes.resize(1); - c->output_tensors_as_shapes[0] = result; - } - } else if (node->type_string() == "Slice") { - ShapeHandle input = ic->input_tensors_as_shapes()[0]; - bool valid = ic->RankKnown(input); - const Tensor* slice_offset = ic->input_tensor(1); - valid &= slice_offset != nullptr && slice_offset->NumElements() == 1; - const Tensor* slice_size = ic->input_tensor(2); - valid &= slice_size != nullptr && slice_size->NumElements() == 1; - if (valid) { - int64 start = slice_offset->dtype() == DT_INT32 - ? slice_offset->flat()(0) - : slice_offset->flat()(0); - int64 end = start + (slice_size->dtype() == DT_INT32 - ? 
slice_size->flat()(0) - : slice_size->flat()(0)); + } else if (node->type_string() == "ConcatV2") { + bool valid = true; ShapeHandle result; - TF_RETURN_IF_ERROR(ic->Subshape(input, start, end, &result)); - c->output_tensors_as_shapes.resize(1); - c->output_tensors_as_shapes[0] = result; + for (int i = 0; i < ic->num_inputs() - 1; ++i) { + ShapeHandle input = ic->input_tensors_as_shapes()[i]; + if (!ic->RankKnown(input)) { + valid = false; + break; + } else if (i == 0) { + result = input; + } else { + TF_RETURN_IF_ERROR(ic->Concatenate(result, input, &result)); + } + } + if (valid) { + c->output_tensors_as_shapes.resize(1); + c->output_tensors_as_shapes[0] = result; + } + } else if (node->type_string() == "Slice") { + ShapeHandle input = ic->input_tensors_as_shapes()[0]; + bool valid = ic->RankKnown(input); + const Tensor* slice_offset = ic->input_tensor(1); + valid &= slice_offset != nullptr && slice_offset->NumElements() == 1; + const Tensor* slice_size = ic->input_tensor(2); + valid &= slice_size != nullptr && slice_size->NumElements() == 1; + if (valid) { + int64 start = slice_offset->dtype() == DT_INT32 + ? slice_offset->flat()(0) + : slice_offset->flat()(0); + int64 end = start + (slice_size->dtype() == DT_INT32 + ? slice_size->flat()(0) + : slice_size->flat()(0)); + ShapeHandle result; + TF_RETURN_IF_ERROR(ic->Subshape(input, start, end, &result)); + c->output_tensors_as_shapes.resize(1); + c->output_tensors_as_shapes[0] = result; + } } } // Infer the shapes of output tensors. - const OpRegistrationData* op_reg_data; - Status s = function_library_.default_registry()->LookUp(node->type_string(), - &op_reg_data); - if (!s.ok() || op_reg_data->shape_inference_fn == nullptr) { + if (!c->op_data || c->op_data->shape_inference_fn == nullptr) { // There is nothing more we can infer, annotate outputs with unknown // shapes return c->inference_context->Run(shape_inference::UnknownShape); } - return c->inference_context->Run(op_reg_data->shape_inference_fn); + TF_RETURN_IF_ERROR( + c->inference_context->Run(c->op_data->shape_inference_fn)); + + Status status = Status::OK(); + if (is_fed) { + // It is possible to feed node output ports with tensors of any shape: as + // a result, the shape of a fed port is completely unknown. + for (const int output_port : it->second) { + status.Update(SetUnknownShape(node, output_port)); + } + } + return status; } NodeContext* GetNodeContext(const Node* node) { @@ -797,6 +819,7 @@ class SymbolicShapeRefiner { std::unordered_map unknown_shapes_; std::unordered_map unknown_dims_; FunctionLibraryDefinition function_library_; + const std::unordered_map>& fed_ports_; }; // Keep track of shapes and dimensions in a graph. @@ -983,23 +1006,6 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner, return Status::OK(); } -Status GraphProperties::OverwriteFedPorts( - SymbolicShapeRefiner* shape_refiner, - const std::unordered_map>& fed_ports, - const Node* node, bool* new_shapes) const { - auto it = fed_ports.find(node->name()); - Status status; - if (it != fed_ports.end()) { - // It is possible to feed node output ports with tensors of any shape: as a - // result, the shape of a fed port is completely unknown. - for (const int output_port : it->second) { - status.Update(shape_refiner->SetUnknownShape(node, output_port)); - } - *new_shapes = true; - } - return status; -} - // Manually propagate the input shape for Enter nodes and update any Merge node // outputs. 
Status GraphProperties::UpdateEnter(SymbolicShapeRefiner* shape_refiner, @@ -1032,7 +1038,6 @@ Status GraphProperties::UpdateEnter(SymbolicShapeRefiner* shape_refiner, Status GraphProperties::UpdateShapes( SymbolicShapeRefiner* shape_refiner, bool relax, - const std::unordered_map>& fed_ports, const Node* n, bool* new_shapes) const { if (n->IsEnter()) { // The Enter shape function always forwards an UnknownShape, so do the right @@ -1053,9 +1058,7 @@ Status GraphProperties::UpdateShapes( } } } - // Nodes can be fed with any shape. The TensorFlow shape inference code can't - // handle this properly, so overwrite its behavior here. - return OverwriteFedPorts(shape_refiner, fed_ports, n, new_shapes); + return Status::OK(); } // Propagates the shapes in the transitive fan-out of . @@ -1063,7 +1066,6 @@ Status GraphProperties::PropagateShapes( SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes, const std::unordered_map>& resources, - const std::unordered_map>& fed_ports, int num_loops) const { // Limit the number of iterations to prevent infinite loops in the presence of // incorrect shape functions. The algoritm should converge in at most @@ -1087,8 +1089,7 @@ Status GraphProperties::PropagateShapes( num_loop_iterations++ < max_loop_iterations) { const Node* n = new_shapes->pop(); bool updated = false; - TF_RETURN_IF_ERROR( - UpdateShapes(shape_refiner, relax, fed_ports, n, &updated)); + TF_RETURN_IF_ERROR(UpdateShapes(shape_refiner, relax, n, &updated)); if (updated) { for (const Edge* e : n->out_edges()) { if (!e->IsControlEdge()) { @@ -1243,7 +1244,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) { } } - SymbolicShapeRefiner refiner(item_.graph); + SymbolicShapeRefiner refiner(item_.graph, fed_ports); // We propagate shapes through the graph in two phases. In the first phase, we // exclusively merge shapes but we do not propagate shapes through the @@ -1267,8 +1268,8 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) { new_shapes.push(node); } // Propagate shapes normally. - TF_RETURN_IF_ERROR(PropagateShapes(&refiner, relax, &new_shapes, resources, - fed_ports, num_loops)); + TF_RETURN_IF_ERROR( + PropagateShapes(&refiner, relax, &new_shapes, resources, num_loops)); } // Track shapes globally across the graph. diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h index 30351f58fd2..4c3f3f5f533 100644 --- a/tensorflow/core/grappler/costs/graph_properties.h +++ b/tensorflow/core/grappler/costs/graph_properties.h @@ -102,16 +102,10 @@ class GraphProperties { // Process the Enter node, and enqueue its fanout in new_shapes if needed. static Status UpdateEnter(SymbolicShapeRefiner* shape_refiner, const Node* node, bool relax, bool* new_shapes); - // Process a node that is used to feed the model. - Status OverwriteFedPorts( - SymbolicShapeRefiner* shape_refiner, - const std::unordered_map>& fed_ports, - const Node* node, bool* new_shapes) const; // Update the shapes for node 'n'. If output shapes for n have changed, // enqueue its fanout in 'new_shapes'. Status UpdateShapes( SymbolicShapeRefiner* shape_refiner, bool relax, - const std::unordered_map>& fed_ports, const Node* n, bool* new_shapes) const; // Propagate the shapes for the nodes enqueued in new_shapes and their // transitive fanout until a fixed point is reached. 
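The fed-ports half of this change, condensed: instead of a separate
OverwriteFedPorts pass, SymbolicShapeRefiner::InferShapes now consults the
fed ports directly, skipping shape-tensor propagation for fed nodes and
marking each fed output as unknown after running the shape function. A
sketch of the flow (names abridged from the .cc hunks above):

  const bool is_fed = fed_ports_.count(node->name()) > 0;
  if (!is_fed) {
    // Propagate shape tensors (Shape, ShapeN, ConcatV2, Slice) as before.
  }
  TF_RETURN_IF_ERROR(ic->Run(c->op_data->shape_inference_fn));
  if (is_fed) {
    // A fed port can receive a tensor of any shape, so mark it unknown.
    for (const int port : fed_ports_.at(node->name())) {
      TF_RETURN_IF_ERROR(SetUnknownShape(node, port));
    }
  }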
@@ -119,7 +113,6 @@ class GraphProperties { SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes, const std::unordered_map>& resources, - const std::unordered_map>& fed_ports, int num_loops) const; // Data members From 0ea0049fa500078c132ed29b60beb8831de26dbb Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 19 Apr 2018 11:57:48 -0700 Subject: [PATCH 0442/1734] Internal cleanup. PiperOrigin-RevId: 193552240 --- .../java/org/tensorflow/lite/DataType.java | 12 ++- .../java/org/tensorflow/lite/Interpreter.java | 19 +++-- .../lite/NativeInterpreterWrapper.java | 21 +++--- .../main/java/org/tensorflow/lite/Tensor.java | 7 +- .../java/src/main/native/exception_jni.cc | 3 +- .../native/nativeinterpreterwrapper_jni.cc | 74 +++++++++++-------- .../lite/java/src/main/native/tensor_jni.cc | 35 +++++---- .../lite/NativeInterpreterWrapperTest.java | 6 +- 8 files changed, 102 insertions(+), 75 deletions(-) diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java index fc16488a645..75334cd96e8 100644 --- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java +++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java @@ -51,7 +51,11 @@ enum DataType { } } throw new IllegalArgumentException( - "DataType " + c + " is not recognized in Java (version " + TensorFlowLite.version() + ")"); + "DataType error: DataType " + + c + + " is not recognized in Java (version " + + TensorFlowLite.version() + + ")"); } /** Returns byte size of the type. */ @@ -68,7 +72,8 @@ enum DataType { case BYTEBUFFER: return 1; } - throw new IllegalArgumentException("DataType " + this + " is not supported yet"); + throw new IllegalArgumentException( + "DataType error: DataType " + this + " is not supported yet"); } /** Gets string names of the data type. 
*/ @@ -85,7 +90,8 @@ enum DataType { case BYTEBUFFER: return "ByteBuffer"; } - throw new IllegalArgumentException("DataType " + this + " is not supported yet"); + throw new IllegalArgumentException( + "DataType error: DataType " + this + " is not supported yet"); } // Cached to avoid copying it diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java index a33959dca49..e915e65aa13 100644 --- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java +++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java @@ -137,17 +137,19 @@ public final class Interpreter implements AutoCloseable { public void runForMultipleInputsOutputs( @NonNull Object[] inputs, @NonNull Map outputs) { if (wrapper == null) { - throw new IllegalStateException("The Interpreter has already been closed."); + throw new IllegalStateException("Internal error: The Interpreter has already been closed."); } Tensor[] tensors = wrapper.run(inputs); if (outputs == null || tensors == null || outputs.size() > tensors.length) { - throw new IllegalArgumentException("Outputs do not match with model outputs."); + throw new IllegalArgumentException("Output error: Outputs do not match with model outputs."); } final int size = tensors.length; for (Integer idx : outputs.keySet()) { if (idx == null || idx < 0 || idx >= size) { throw new IllegalArgumentException( - String.format("Invalid index of output %d (should be in range [0, %d))", idx, size)); + String.format( + "Output error: Invalid index of output %d (should be in range [0, %d))", + idx, size)); } tensors[idx].copyTo(outputs.get(idx)); } @@ -160,7 +162,7 @@ public final class Interpreter implements AutoCloseable { */ public void resizeInput(int idx, @NonNull int[] dims) { if (wrapper == null) { - throw new IllegalStateException("The Interpreter has already been closed."); + throw new IllegalStateException("Internal error: The Interpreter has already been closed."); } wrapper.resizeInput(idx, dims); } @@ -173,7 +175,7 @@ public final class Interpreter implements AutoCloseable { */ public int getInputIndex(String opName) { if (wrapper == null) { - throw new IllegalStateException("The Interpreter has already been closed."); + throw new IllegalStateException("Internal error: The Interpreter has already been closed."); } return wrapper.getInputIndex(opName); } @@ -186,7 +188,7 @@ public final class Interpreter implements AutoCloseable { */ public int getOutputIndex(String opName) { if (wrapper == null) { - throw new IllegalStateException("The Interpreter has already been closed."); + throw new IllegalStateException("Internal error: The Interpreter has already been closed."); } return wrapper.getOutputIndex(opName); } @@ -198,7 +200,7 @@ public final class Interpreter implements AutoCloseable { */ public Long getLastNativeInferenceDurationNanoseconds() { if (wrapper == null) { - throw new IllegalStateException("The interpreter has already been closed."); + throw new IllegalStateException("Internal error: The interpreter has already been closed."); } return wrapper.getLastNativeInferenceDurationNanoseconds(); } @@ -208,7 +210,8 @@ public final class Interpreter implements AutoCloseable { if (wrapper != null) { wrapper.setUseNNAPI(useNNAPI); } else { - throw new IllegalStateException("NativeInterpreterWrapper has already been closed."); + throw new IllegalStateException( + "Internal error: NativeInterpreterWrapper has already 
been closed."); } } diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java index fc8187acfeb..dfc8ac111a2 100644 --- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java +++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java @@ -80,7 +80,7 @@ final class NativeInterpreterWrapper implements AutoCloseable { /** Sets inputs, runs model inference and returns outputs. */ Tensor[] run(Object[] inputs) { if (inputs == null || inputs.length == 0) { - throw new IllegalArgumentException("Invalid inputs. Inputs should not be null or empty."); + throw new IllegalArgumentException("Input error: Inputs should not be null or empty."); } int[] dataTypes = new int[inputs.length]; Object[] sizes = new Object[inputs.length]; @@ -92,7 +92,7 @@ final class NativeInterpreterWrapper implements AutoCloseable { ByteBuffer buffer = (ByteBuffer) inputs[i]; if (buffer.order() != ByteOrder.nativeOrder()) { throw new IllegalArgumentException( - "Invalid ByteBuffer. It shoud use ByteOrder.nativeOrder()."); + "Input error: ByteBuffer shoud use ByteOrder.nativeOrder()."); } numsOfBytes[i] = buffer.limit(); sizes[i] = getInputDims(interpreterHandle, i, numsOfBytes[i]); @@ -103,7 +103,7 @@ final class NativeInterpreterWrapper implements AutoCloseable { } else { throw new IllegalArgumentException( String.format( - "%d-th element of the %d inputs is not an array or a ByteBuffer.", + "Input error: %d-th element of the %d inputs is not an array or a ByteBuffer.", i, inputs.length)); } } @@ -119,7 +119,7 @@ final class NativeInterpreterWrapper implements AutoCloseable { this, isMemoryAllocated); if (outputsHandles == null || outputsHandles.length == 0) { - throw new IllegalStateException("Interpreter has no outputs."); + throw new IllegalStateException("Internal error: Interpreter has no outputs."); } isMemoryAllocated = true; Tensor[] outputs = new Tensor[outputsHandles.length]; @@ -169,7 +169,8 @@ final class NativeInterpreterWrapper implements AutoCloseable { } else { throw new IllegalArgumentException( String.format( - "%s is not a valid name for any input. The indexes of the inputs are %s", + "Input error: %s is not a valid name for any input. " + + "The indexes of the inputs are %s", name, inputsIndexes.toString())); } } @@ -190,7 +191,8 @@ final class NativeInterpreterWrapper implements AutoCloseable { } else { throw new IllegalArgumentException( String.format( - "%s is not a valid name for any output. The indexes of the outputs are %s", + "Input error: %s is not a valid name for any output. " + + "The indexes of the outputs are %s", name, outputsIndexes.toString())); } } @@ -229,7 +231,8 @@ final class NativeInterpreterWrapper implements AutoCloseable { return DataType.BYTEBUFFER; } } - throw new IllegalArgumentException("cannot resolve DataType of " + o.getClass().getName()); + throw new IllegalArgumentException( + "DataType error: cannot resolve DataType of " + o.getClass().getName()); } /** Returns the shape of an object as an int array. 
*/ @@ -245,7 +248,7 @@ final class NativeInterpreterWrapper implements AutoCloseable { return 0; } if (Array.getLength(o) == 0) { - throw new IllegalArgumentException("array lengths cannot be 0."); + throw new IllegalArgumentException("Array lengths cannot be 0."); } return 1 + numDimensions(Array.get(o, 0)); } @@ -259,7 +262,7 @@ final class NativeInterpreterWrapper implements AutoCloseable { shape[dim] = len; } else if (shape[dim] != len) { throw new IllegalArgumentException( - String.format("mismatched lengths (%d and %d) in dimension %d", shape[dim], len, dim)); + String.format("Mismatched lengths (%d and %d) in dimension %d", shape[dim], len, dim)); } for (int i = 0; i < len; ++i) { fillShape(Array.get(o, i), dim + 1, shape); diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java index 54ace6c63ce..09e887aae33 100644 --- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java +++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java @@ -34,15 +34,16 @@ final class Tensor { if (NativeInterpreterWrapper.dataTypeOf(dst) != dtype) { throw new IllegalArgumentException( String.format( - "Cannot convert an TensorFlowLite tensor with type %s to a Java object of " - + "type %s (which is compatible with the TensorFlowLite type %s)", + "Output error: Cannot convert an TensorFlowLite tensor with type %s to a Java " + + "object of type %s (which is compatible with the TensorFlowLite type %s)", dtype, dst.getClass().getName(), NativeInterpreterWrapper.dataTypeOf(dst))); } int[] dstShape = NativeInterpreterWrapper.shapeOf(dst); if (!Arrays.equals(dstShape, shapeCopy)) { throw new IllegalArgumentException( String.format( - "Shape of output target %s does not match with the shape of the Tensor %s.", + "Output error: Shape of output target %s does not match with the shape of the " + + "Tensor %s.", Arrays.toString(dstShape), Arrays.toString(shapeCopy))); } readMultiDimensionalArray(nativeHandle, dst); diff --git a/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc b/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc index 1578c9e3ddd..34d91be04cd 100644 --- a/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc +++ b/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc @@ -44,7 +44,8 @@ BufferErrorReporter::BufferErrorReporter(JNIEnv* env, int limit) { buffer_ = new char[limit]; if (!buffer_) { throwException(env, kNullPointerException, - "Malloc of BufferErrorReporter to hold %d char failed.", + "Internal error: Malloc of BufferErrorReporter to hold %d " + "char failed.", limit); return; } diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc index 844226203bb..ccfdfd829b4 100644 --- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc +++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc @@ -22,7 +22,7 @@ const int kBufferSize = 256; tflite::Interpreter* convertLongToInterpreter(JNIEnv* env, jlong handle) { if (handle == 0) { throwException(env, kIllegalArgumentException, - "Invalid handle to Interpreter."); + "Internal error: Invalid handle to Interpreter."); return nullptr; } return reinterpret_cast(handle); @@ -30,7 +30,8 @@ tflite::Interpreter* convertLongToInterpreter(JNIEnv* env, jlong handle) { tflite::FlatBufferModel* 
convertLongToModel(JNIEnv* env, jlong handle) { if (handle == 0) { - throwException(env, kIllegalArgumentException, "Invalid handle to model."); + throwException(env, kIllegalArgumentException, + "Internal error: Invalid handle to model."); return nullptr; } return reinterpret_cast<tflite::FlatBufferModel*>(handle); } @@ -39,7 +40,7 @@ tflite::FlatBufferModel* convertLongToModel(JNIEnv* env, jlong handle) { BufferErrorReporter* convertLongToErrorReporter(JNIEnv* env, jlong handle) { if (handle == 0) { throwException(env, kIllegalArgumentException, - "Invalid handle to ErrorReporter."); + "Internal error: Invalid handle to ErrorReporter."); return nullptr; } return reinterpret_cast<BufferErrorReporter*>(handle); } @@ -51,7 +52,7 @@ std::vector<int> convertJIntArrayToVector(JNIEnv* env, jintArray inputs) { jint* ptr = env->GetIntArrayElements(inputs, nullptr); if (ptr == nullptr) { throwException(env, kIllegalArgumentException, - "Empty dimensions of input array."); + "Array has empty dimensions."); return {}; } for (int i = 0; i < size; ++i) { @@ -113,7 +114,7 @@ TfLiteStatus checkInputs(JNIEnv* env, tflite::Interpreter* interpreter, jobjectArray sizes) { if (input_size != interpreter->inputs().size()) { throwException(env, kIllegalArgumentException, - "Expected num of inputs is %d but got %d", + "Input error: Expected num of inputs is %d but got %d", interpreter->inputs().size(), input_size); return kTfLiteError; } @@ -121,8 +122,9 @@ TfLiteStatus checkInputs(JNIEnv* env, tflite::Interpreter* interpreter, input_size != env->GetArrayLength(nums_of_bytes) || input_size != env->GetArrayLength(values)) { throwException(env, kIllegalArgumentException, - "Arrays in arguments should be of the same length, but got " - "%d sizes, %d data_types, %d nums_of_bytes, and %d values", + "Internal error: Arrays in arguments should be of the same " + "length, but got %d sizes, %d data_types, %d nums_of_bytes, " + "and %d values", input_size, env->GetArrayLength(data_types), env->GetArrayLength(nums_of_bytes), env->GetArrayLength(values)); @@ -136,8 +138,8 @@ TfLiteStatus checkInputs(JNIEnv* env, tflite::Interpreter* interpreter, int num_dims = static_cast<int>(env->GetArrayLength(dims)); if (target->dims->size != num_dims) { throwException(env, kIllegalArgumentException, - "%d-th input should have %d dimensions, but found %d " - "dimensions", + "Input error: %d-th input should have %d dimensions, but " + "found %d dimensions", i, target->dims->size, num_dims); return kTfLiteError; } @@ -150,7 +152,8 @@ TfLiteStatus checkInputs(JNIEnv* env, tflite::Interpreter* interpreter, num_dims); printDims(obtained_dims.get(), kBufferSize, ptr, num_dims); throwException(env, kIllegalArgumentException, - "%d-th input dimension should be [%s], but found [%s]", + "Input error: %d-th input dimension should be [%s], but " + "found [%s]", i, expected_dims.get(), obtained_dims.get()); env->ReleaseIntArrayElements(dims, ptr, JNI_ABORT); return kTfLiteError; @@ -236,8 +239,8 @@ TfLiteStatus setInputs(JNIEnv* env, tflite::Interpreter* interpreter, TfLiteType type = resolveDataType(data_type[i]); if (type != target->type) { throwException(env, kIllegalArgumentException, - "DataType (%d) of input data does not match with the " - "DataType (%d) of model inputs.", + "Input error: DataType (%d) of input data does not " + "match with the DataType (%d) of model inputs.", type, target->type); return kTfLiteError; } @@ -270,7 +273,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputNames(JNIEnv* env, jclass string_class = env->FindClass("java/lang/String"); if (string_class == nullptr) {
throwException(env, kUnsupportedOperationException, - "Can not find java/lang/String class to get input names."); + "Internal error: Cannot find java/lang/String class to get " "input names."); return nullptr; } size_t size = interpreter->inputs().size(); @@ -292,7 +296,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputNames(JNIEnv* env, jclass string_class = env->FindClass("java/lang/String"); if (string_class == nullptr) { throwException(env, kUnsupportedOperationException, - "Can not find java/lang/String class to get output names."); + "Internal error: Cannot find java/lang/String class to get " + "output names."); return nullptr; } size_t size = interpreter->outputs().size(); @@ -351,8 +356,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModel( path, verifier.get(), error_reporter); if (!model) { throwException(env, kIllegalArgumentException, - "Contents of %s does not encode a valid TensorFlowLite " - "model: %s", + "Contents of %s does not encode a valid " + "TensorFlowLite model: %s", path, error_reporter->CachedErrorMessage()); env->ReleaseStringUTFChars(model_file, path); return 0; @@ -380,8 +385,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModelWithBuffer( buf, static_cast<size_t>(capacity), error_reporter); if (!model) { throwException(env, kIllegalArgumentException, - "MappedByteBuffer does not encode a valid TensorFlowLite " - "model: %s", + "MappedByteBuffer does not encode a valid " + "TensorFlowLite model: %s", error_reporter->CachedErrorMessage()); return 0; } @@ -403,7 +408,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter( &interpreter, static_cast<int>(num_threads)); if (status != kTfLiteOk) { throwException(env, kIllegalArgumentException, - "Cannot create interpreter: %s", + "Internal error: Cannot create interpreter: %s", error_reporter->CachedErrorMessage()); return 0; } @@ -411,7 +416,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter( status = interpreter->AllocateTensors(); if (status != kTfLiteOk) { throwException(env, kNullPointerException, - "Can not allocate memory for the interpreter", + "Internal error: Cannot allocate memory for the interpreter", error_reporter->CachedErrorMessage()); return 0; } @@ -440,7 +445,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_run( // resizes inputs status = resizeInputs(env, interpreter, input_size, sizes); if (status != kTfLiteOk) { - throwException(env, kNullPointerException, "Can not resize the input: %s", + throwException(env, kNullPointerException, + "Internal error: Cannot resize the input: %s", error_reporter->CachedErrorMessage()); return nullptr; } @@ -448,7 +454,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_run( status = interpreter->AllocateTensors(); if (status != kTfLiteOk) { throwException(env, kNullPointerException, - "Can not allocate memory for the given inputs: %s", + "Internal error: Cannot allocate memory for the given " + "inputs: %s", error_reporter->CachedErrorMessage()); return nullptr; } @@ -461,7 +468,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_run( // runs inference if (interpreter->Invoke() != kTfLiteOk) { throwException(env, kIllegalArgumentException, - "Failed to run on the given Interpreter: %s", + "Internal error: Failed to run on the given Interpreter: %s", error_reporter->CachedErrorMessage()); return nullptr; } @@ -479,8 +486,9 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_run( // returns outputs const std::vector<int>& results = interpreter->outputs(); if (results.empty()) { -
throwException(env, kIllegalArgumentException, - "The Interpreter does not have any outputs."); + throwException( + env, kIllegalArgumentException, + "Internal error: The Interpreter does not have any outputs."); return nullptr; } jlongArray outputs = env->NewLongArray(results.size()); @@ -501,7 +509,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputDims( const int idx = static_cast<int>(input_idx); if (input_idx < 0 || input_idx >= interpreter->inputs().size()) { throwException(env, kIllegalArgumentException, - "Out of range: Failed to get %d-th input out of %d inputs", + "Input error: Out of range: Failed to get %d-th input out of" + " %d inputs", input_idx, interpreter->inputs().size()); return nullptr; } @@ -514,8 +523,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputDims( } if (num_bytes != expected_num_bytes) { throwException(env, kIllegalArgumentException, - "Failed to get input dimensions. %d-th input should have" - " %d bytes, but found %d bytes.", + "Input error: Failed to get input dimensions. %d-th input " + "should have %d bytes, but found %d bytes.", idx, expected_num_bytes, num_bytes); return nullptr; } @@ -533,8 +542,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputDataType( const int idx = static_cast<int>(output_idx); if (output_idx < 0 || output_idx >= interpreter->outputs().size()) { throwException(env, kIllegalArgumentException, - "Out of range: Failed to get %d-th output out of %d outputs", - output_idx, interpreter->outputs().size()); + "Failed to get %d-th output out of %d outputs", output_idx, + interpreter->outputs().size()); return -1; } TfLiteTensor* target = interpreter->tensor(interpreter->outputs()[idx]); @@ -555,7 +564,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_resizeInput( const int idx = static_cast<int>(input_idx); if (idx < 0 || idx >= interpreter->inputs().size()) { throwException(env, kIllegalArgumentException, - "Can not resize %d-th input for a model having %d inputs.", + "Input error: Cannot resize %d-th input for a model having " + "%d inputs.", idx, interpreter->inputs().size()); return JNI_FALSE; } @@ -567,7 +577,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_resizeInput( interpreter->inputs()[idx], convertJIntArrayToVector(env, dims)); if (status != kTfLiteOk) { throwException(env, kIllegalArgumentException, - "Failed to resize %d-th input: %s", idx, + "Internal error: Failed to resize %d-th input: %s", idx, error_reporter->CachedErrorMessage()); return JNI_FALSE; } diff --git a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc index 65126e78a30..17f4be09c63 100644 --- a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc +++ b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc @@ -23,7 +23,7 @@ namespace { TfLiteTensor* convertLongToTensor(JNIEnv* env, jlong handle) { if (handle == 0) { throwException(env, kIllegalArgumentException, - "Invalid handle to TfLiteTensor."); + "Internal error: Invalid handle to TfLiteTensor."); return nullptr; } return reinterpret_cast<TfLiteTensor*>(handle); } @@ -36,7 +36,8 @@ size_t writeOneDimensionalArray(JNIEnv* env, jobject object, TfLiteType type, size_t to_copy = num_elements * elementByteSize(type); if (to_copy > dst_size) { throwException(env, kIllegalStateException, - "cannot write Java array of %d bytes to Tensor of %d bytes", + "Internal error: cannot write Java array of %d bytes to " + "Tensor of %d bytes", to_copy, dst_size); return 0; } @@ -71,10 +72,10 @@ size_t
writeOneDimensionalArray(JNIEnv* env, jobject object, TfLiteType type, } default: { throwException(env, kUnsupportedOperationException, - "TensorFlowLite currently supports float (32 bits), " - "int (32 bits), byte (8 bits), and long (64 bits), " - "support for other types (DataType %d in this case) will " - "be added in the future", + "DataType error: TensorFlowLite currently supports float " + "(32 bits), int (32 bits), byte (8 bits), and long " + "(64 bits), support for other types (DataType %d in this " + "case) will be added in the future", kTfLiteFloat32, type); return 0; } @@ -88,8 +89,9 @@ size_t readOneDimensionalArray(JNIEnv* env, TfLiteType data_type, if (size > src_size) { throwException( env, kIllegalStateException, - "cannot fill a Java array of %d bytes with a Tensor of %d bytes", size, - src_size); + "Internal error: cannot fill a Java array of %d bytes with a Tensor of " + "%d bytes", + size, src_size); return 0; } switch (data_type) { @@ -117,8 +119,8 @@ size_t readOneDimensionalArray(JNIEnv* env, TfLiteType data_type, return size; } default: { - throwException(env, kIllegalStateException, "invalid DataType(%d)", - data_type); + throwException(env, kIllegalStateException, + "DataType error: invalid DataType(%d)", data_type); } } return 0; @@ -152,19 +154,22 @@ size_t elementByteSize(TfLiteType data_type) { switch (data_type) { case kTfLiteFloat32: static_assert(sizeof(jfloat) == 4, - "Java float not compatible with kTfLiteFloat"); + "Internal error: Java float not compatible with " + "kTfLiteFloat"); return 4; case kTfLiteInt32: static_assert(sizeof(jint) == 4, - "Java int not compatible with kTfLiteInt"); + "Internal error: Java int not compatible with kTfLiteInt"); return 4; case kTfLiteUInt8: static_assert(sizeof(jbyte) == 1, - "Java byte not compatible with kTfLiteUInt8"); + "Internal error: Java byte not compatible with " + "kTfLiteUInt8"); return 1; case kTfLiteInt64: static_assert(sizeof(jlong) == 8, - "Java long not compatible with kTfLiteInt64"); + "Internal error: Java long not compatible with " + "kTfLiteInt64"); return 8; default: return 0; @@ -212,7 +217,7 @@ Java_org_tensorflow_lite_Tensor_readMultiDimensionalArray(JNIEnv* env, int num_dims = tensor->dims->size; if (num_dims == 0) { throwException(env, kIllegalArgumentException, - "copyTo() is not meant for scalar Tensors."); + "Internal error: Cannot copy empty/scalar Tensors."); return; } readMultiDimensionalArray(env, tensor->type, tensor->data.raw, tensor->bytes, diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java index dbe45e5a05b..7c00d3196fd 100644 --- a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java +++ b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java @@ -321,9 +321,7 @@ public final class NativeInterpreterWrapperTest { wrapper.run(inputs); fail(); } catch (IllegalArgumentException e) { - assertThat(e) - .hasMessageThat() - .contains("Invalid inputs.
Inputs should not be null or empty."); + assertThat(e).hasMessageThat().contains("Inputs should not be null or empty."); } wrapper.close(); } @@ -440,7 +438,7 @@ public final class NativeInterpreterWrapperTest { NativeInterpreterWrapper.numDimensions(emptyArray); fail(); } catch (IllegalArgumentException e) { - assertThat(e).hasMessageThat().contains("array lengths cannot be 0."); + assertThat(e).hasMessageThat().contains("Array lengths cannot be 0."); } } From 16d25e8c8a9ebb6500d3b3418ca8c2bb80c3e42e Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Thu, 19 Apr 2018 11:58:04 -0700 Subject: [PATCH 0443/1734] Add support for Dataset Iterators in Model training/eval methods in graph mode. PiperOrigin-RevId: 193552275 --- tensorflow/python/keras/BUILD | 1 + .../keras/_impl/keras/engine/training.py | 195 ++++++++++++------ .../_impl/keras/engine/training_arrays.py | 12 +- .../keras/_impl/keras/engine/training_test.py | 84 +++++++- .../api/golden/tensorflow.keras.-model.pbtxt | 4 +- .../golden/tensorflow.keras.-sequential.pbtxt | 4 +- .../tensorflow.keras.models.-model.pbtxt | 4 +- .../tensorflow.keras.models.-sequential.pbtxt | 4 +- 8 files changed, 223 insertions(+), 85 deletions(-) diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD index ca7686b1d1d..70040b7e740 100755 --- a/tensorflow/python/keras/BUILD +++ b/tensorflow/python/keras/BUILD @@ -175,6 +175,7 @@ py_library( srcs_version = "PY2AND3", deps = [ ":backend", + "//tensorflow/python/data", "@six_archive//:six", ], ) diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py index 012d9ceea43..146e8fdac9a 100644 --- a/tensorflow/python/keras/_impl/keras/engine/training.py +++ b/tensorflow/python/keras/_impl/keras/engine/training.py @@ -20,6 +20,8 @@ from __future__ import print_function import numpy as np +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.data.ops import iterator_ops from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util @@ -634,12 +636,20 @@ class Model(Network): This is a purely internal method, subject to refactoring at any time. Args: - x: An array or list of arrays, to be used as input data. If the model - has known, named inputs, this could also be a dict mapping input names - to the corresponding array. - y: An array or list of arrays, to be used as target data. If the model - has known, named outputs, this could also be a dict mapping output names - to the corresponding array. + x: Input data. It could be: + - A Numpy array (or array-like), or a list of arrays + (in case the model has multiple inputs). + - A TensorFlow tensor, or a list of tensors + (in case the model has multiple inputs). + - A dict mapping input names to the corresponding array/tensors, + if the model has named inputs. + - A `tf.data` dataset iterator. + y: Target data. Like the input data `x`, + it could be either Numpy array(s) or TensorFlow tensor(s). + It should be consistent with `x` (you cannot have Numpy inputs and + tensor targets, or inversely). If `x` is a dataset iterator, + `y` should not be specified + (since targets will be obtained from the iterator). sample_weight: An optional sample-weight array passed by the user to weight the importance of each sample in `x`. class_weight: An optional class-weight array by the user to @@ -659,6 +669,31 @@ class Model(Network): RuntimeError: If the model was never compiled. 
""" # First, we build/compile the model on the fly if necessary. + if isinstance(x, dataset_ops.Dataset): + raise ValueError('You passed a `Dataset` instance to your model (%s), ' + 'which is not supported. Instead, pass an `Iterator`, ' + 'which you can obtain e.g. via ' + '`dataset.make_one_shot_iterator()` (the exact method ' + 'to use will depend on your specific dataset).' % x) + if isinstance(x, iterator_ops.Iterator): + if y is not None: + raise ValueError('You passed a dataset iterator (%s) as input `x` to ' + 'your model. In that case, you should not specify ' + 'a target (`y`) argument, since the dataset iterator ' + 'generates both input data and target data. ' + 'Received: %s' % (x, y)) + if not context.executing_eagerly(): + x, y = x.get_next() + # TODO(fchollet): handle case of `get_next` not returning 2 tensors? + else: + # TODO(psv): implement this. The way to support it will be to typecheck + # for `iterator` before `_standardize_user_data` is called and redirect + # to new training/eval functions in `training_eager.py`. The model + # may need to get built using the specs of the data from the first batch + # drawn from the iterator. + raise ValueError('Dataset iterators are not supported ' + 'with eager execution yet.') + all_inputs = [] if not self.built: # We need to use `x` to set the model inputs. @@ -1016,22 +1051,26 @@ class Model(Network): """Trains the model for a fixed number of epochs (iterations on a dataset). Arguments: - x: Numpy array of training data (if the model has a single input), - or list of Numpy arrays (if the model has multiple inputs). - If input layers in the model are named, you can also pass a - dictionary mapping input names to Numpy arrays. - `x` can be `None` (default) if feeding from - TensorFlow data tensors. - y: Numpy array of target (label) data - (if the model has a single output), - or list of Numpy arrays (if the model has multiple outputs). - If output layers in the model are named, you can also pass a - dictionary mapping output names to Numpy arrays. - `y` can be `None` (default) if feeding from - TensorFlow data tensors. + x: Input data. It could be: + - A Numpy array (or array-like), or a list of arrays + (in case the model has multiple inputs). + - A TensorFlow tensor, or a list of tensors + (in case the model has multiple inputs). + - A dict mapping input names to the corresponding array/tensors, + if the model has named inputs. + - A `tf.data` dataset iterator. + y: Target data. Like the input data `x`, + it could be either Numpy array(s) or TensorFlow tensor(s). + It should be consistent with `x` (you cannot have Numpy inputs and + tensor targets, or inversely). If `x` is a dataset iterator, + `y` should not be specified + (since targets will be obtained from the iterator). batch_size: Integer or `None`. Number of samples per gradient update. If unspecified, `batch_size` will default to 32. + Do not specify the `batch_size` is your data is in the + form of symbolic tensors or dataset iterators (since they generate + batches). epochs: Integer. Number of epochs to train the model. An epoch is an iteration over the entire `x` and `y` data provided. @@ -1053,11 +1092,14 @@ class Model(Network): on this data at the end of each epoch. The validation data is selected from the last samples in the `x` and `y` data provided, before shuffling. 
- validation_data: tuple `(x_val, y_val)` or tuple - `(x_val, y_val, val_sample_weights)` on which to evaluate + validation_data: Data on which to evaluate the loss and any model metrics at the end of each epoch. The model will not be trained on this data. `validation_data` will override `validation_split`. + `validation_data` could be: + - tuple `(x_val, y_val)` of Numpy arrays or tensors + - tuple `(x_val, y_val, val_sample_weights)` of Numpy arrays + - dataset iterator shuffle: Boolean (whether to shuffle the training data before each epoch) or str (for 'batch'). 'batch' is a special option for dealing with the @@ -1134,17 +1176,22 @@ class Model(Network): batch_size=batch_size) # Prepare validation data. if validation_data: - if len(validation_data) == 2: + if isinstance(validation_data, iterator_ops.Iterator): + val_x = validation_data + val_y = None + val_sample_weight = None + elif len(validation_data) == 2: val_x, val_y = validation_data # pylint: disable=unpacking-non-sequence val_sample_weight = None elif len(validation_data) == 3: val_x, val_y, val_sample_weight = validation_data # pylint: disable=unpacking-non-sequence else: raise ValueError( - 'When passing validation_data, ' - 'it must contain 2 (x_val, y_val) ' - 'or 3 (x_val, y_val, val_sample_weights) ' - 'items, however it contains %d items' % len(validation_data)) + 'When passing a `validation_data` argument, ' + 'it must contain either 2 items (x_val, y_val), ' + 'or 3 items (x_val, y_val, val_sample_weights), ' + 'or alternatively it could be a dataset iterator. However we ' + 'received `validation_data=%s`' % validation_data) val_x, val_y, val_sample_weights = self._standardize_user_data( val_x, @@ -1218,22 +1265,26 @@ class Model(Network): Computation is done in batches. Arguments: - x: Numpy array of test data (if the model has a single input), - or list of Numpy arrays (if the model has multiple inputs). - If input layers in the model are named, you can also pass a - dictionary mapping input names to Numpy arrays. - `x` can be `None` (default) if feeding from - TensorFlow data tensors. - y: Numpy array of target (label) data - (if the model has a single output), - or list of Numpy arrays (if the model has multiple outputs). - If output layers in the model are named, you can also pass a - dictionary mapping output names to Numpy arrays. - `y` can be `None` (default) if feeding from - TensorFlow data tensors. + x: Input data. It could be: + - A Numpy array (or array-like), or a list of arrays + (in case the model has multiple inputs). + - A TensorFlow tensor, or a list of tensors + (in case the model has multiple inputs). + - A dict mapping input names to the corresponding array/tensors, + if the model has named inputs. + - A `tf.data` dataset iterator. + y: Target data. Like the input data `x`, + it could be either Numpy array(s) or TensorFlow tensor(s). + It should be consistent with `x` (you cannot have Numpy inputs and + tensor targets, or inversely). If `x` is a dataset iterator, + `y` should not be specified + (since targets will be obtained from the iterator). batch_size: Integer or `None`. - Number of samples per evaluation step. + Number of samples per evaluation batch. If unspecified, `batch_size` will default to 32. + Do not specify the `batch_size` if your data is in the + form of symbolic tensors or dataset iterators (since they generate + batches). verbose: 0 or 1. Verbosity mode. 0 = silent, 1 = progress bar.
sample_weight: Optional Numpy array of weights for @@ -1291,9 +1342,13 @@ class Model(Network): Computation is done in batches. Arguments: - x: The input data, as a Numpy array - (or list of Numpy arrays if the model has multiple outputs). - batch_size: Integer. If unspecified, it will default to 32. + x: Input samples, as Numpy array(s) or tensor(s). + batch_size: Integer or `None`. + Number of samples per batch. + If unspecified, `batch_size` will default to 32. + Do not specify the `batch_size` if your data is in the + form of symbolic tensors or dataset iterators (since they generate + batches). verbose: Verbosity mode, 0 or 1. steps: Total number of steps (batches of samples) before declaring the prediction round finished. @@ -1324,20 +1379,24 @@ class Model(Network): return training_arrays.predict_loop( self, x, batch_size=batch_size, verbose=verbose, steps=steps) - def train_on_batch(self, x, y, sample_weight=None, class_weight=None): + def train_on_batch(self, x, y=None, sample_weight=None, class_weight=None): """Runs a single gradient update on a single batch of data. Arguments: - x: Numpy array of training data, - or list of Numpy arrays if the model has multiple inputs. - If all inputs in the model are named, - you can also pass a dictionary - mapping input names to Numpy arrays. - y: Numpy array of target data, - or list of Numpy arrays if the model has multiple outputs. - If all outputs in the model are named, - you can also pass a dictionary - mapping output names to Numpy arrays. + x: Input data. It could be: + - A Numpy array (or array-like), or a list of arrays + (in case the model has multiple inputs). + - A TensorFlow tensor, or a list of tensors + (in case the model has multiple inputs). + - A dict mapping input names to the corresponding array/tensors, + if the model has named inputs. + - A `tf.data` dataset iterator. + y: Target data. Like the input data `x`, + it could be either Numpy array(s) or TensorFlow tensor(s). + It should be consistent with `x` (you cannot have Numpy inputs and + tensor targets, or inversely). If `x` is a dataset iterator, + `y` should not be specified + (since targets will be obtained from the iterator). sample_weight: Optional array of the same length as x, containing weights to apply to the model's loss for each sample. In the case of temporal data, you can pass a 2D array @@ -1384,20 +1443,24 @@ class Model(Network): return outputs[0] return outputs - def test_on_batch(self, x, y, sample_weight=None): + def test_on_batch(self, x, y=None, sample_weight=None): """Test the model on a single batch of samples. Arguments: + x: Input data. It could be: + - A Numpy array (or array-like), or a list of arrays + (in case the model has multiple inputs). + - A TensorFlow tensor, or a list of tensors + (in case the model has multiple inputs). + - A dict mapping input names to the corresponding array/tensors, + if the model has named inputs. + - A `tf.data` dataset iterator. + y: Target data. Like the input data `x`, + it could be either Numpy array(s) or TensorFlow tensor(s).
+ It should be consistent with `x` (you cannot have Numpy inputs and + tensor targets, or inversely). If `x` is a dataset iterator, + `y` should not be specified + (since targets will be obtained from the iterator). sample_weight: Optional array of the same length as x, containing weights to apply to the model's loss for each sample. In the case of temporal data, you can pass a 2D array @@ -1437,7 +1500,7 @@ class Model(Network): """Returns predictions for a single batch of samples. Arguments: - x: Input samples, as a Numpy array. + x: Input samples, as Numpy array(s) or tensor(s). Returns: Numpy array(s) of predictions. diff --git a/tensorflow/python/keras/_impl/keras/engine/training_arrays.py b/tensorflow/python/keras/_impl/keras/engine/training_arrays.py index 18116e3a14d..4164cae864c 100644 --- a/tensorflow/python/keras/_impl/keras/engine/training_arrays.py +++ b/tensorflow/python/keras/_impl/keras/engine/training_arrays.py @@ -23,6 +23,7 @@ import copy import numpy as np +from tensorflow.python.framework import errors from tensorflow.python.keras._impl.keras import backend as K from tensorflow.python.keras._impl.keras import callbacks as cbks from tensorflow.python.keras._impl.keras.engine import training_utils @@ -30,6 +31,7 @@ from tensorflow.python.keras._impl.keras.engine.base_layer import Layer from tensorflow.python.keras._impl.keras.utils.generic_utils import make_batches from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays +from tensorflow.python.platform import tf_logging as logging try: from scipy.sparse import issparse # pylint: disable=g-import-not-at-top @@ -190,7 +192,15 @@ def fit_loop(model, batch_logs['batch'] = step_index batch_logs['size'] = 1 callbacks.on_batch_begin(step_index, batch_logs) - outs = f(ins) + try: + outs = f(ins) + except errors.OutOfRangeError: + logging.warning('Your dataset iterator ran out of data; ' + 'interrupting training. Make sure that your dataset ' + 'can generate at least `steps_per_epoch * epochs` ' + 'batches (in this case, %d batches).' % + (steps_per_epoch * epochs)) + break if not isinstance(outs, list): outs = [outs] diff --git a/tensorflow/python/keras/_impl/keras/engine/training_test.py b/tensorflow/python/keras/_impl/keras/engine/training_test.py index d9281436dee..58011a14126 100644 --- a/tensorflow/python/keras/_impl/keras/engine/training_test.py +++ b/tensorflow/python/keras/_impl/keras/engine/training_test.py @@ -23,6 +23,7 @@ import unittest import numpy as np +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import test_util as tf_test_util from tensorflow.python.keras._impl import keras @@ -31,9 +32,9 @@ from tensorflow.python.keras._impl.keras.engine.training_utils import weighted_m from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays from tensorflow.python.ops import array_ops from tensorflow.python.platform import test +from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training.rmsprop import RMSPropOptimizer - try: import scipy.sparse as scipy_sparse # pylint: disable=g-import-not-at-top except ImportError: @@ -1711,14 +1712,77 @@ class TestTrainingWithDataTensors(test.TestCase): 'dropout_acc'] self.assertEqual(reference_metric_names, model.metrics_names) -if __name__ == '__main__': - # Bazel sets these environment variables to very long paths.
- # Tempfile uses them to create long paths, and in turn multiprocessing - # library tries to create sockets named after paths. Delete whatever bazel - # writes to these to avoid tests failing due to socket addresses being too - # long. - for var in ('TMPDIR', 'TMP', 'TEMP'): - if var in os.environ: - del os.environ[var] +class TestTrainingWithDatasetIterators(test.TestCase): + + def test_training_and_eval_methods_on_iterators_single_io(self): + with self.test_session(): + x = keras.layers.Input(shape=(3,), name='input') + y = keras.layers.Dense(4, name='dense')(x) + model = keras.Model(x, y) + + optimizer = 'rmsprop' + loss = 'mse' + metrics = ['mae'] + model.compile(optimizer, loss, metrics=metrics) + + inputs = np.zeros((10, 3)) + targets = np.zeros((10, 4)) + dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(100) + dataset = dataset.batch(10) + iterator = dataset.make_one_shot_iterator() + + model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=0) + model.evaluate(iterator, steps=2, verbose=0) + model.predict(iterator, steps=2) + model.train_on_batch(iterator) + model.test_on_batch(iterator) + # Test with validation data + model.fit(iterator, + epochs=1, steps_per_epoch=2, verbose=0, + validation_data=iterator, validation_steps=2) + # Test with validation split + with self.assertRaisesRegexp(ValueError, + 'you cannot use `validation_split`'): + model.fit(iterator, + epochs=1, steps_per_epoch=2, verbose=0, + validation_split=0.5, validation_steps=2) + + # Test invalid usage + with self.assertRaisesRegexp(ValueError, + 'Instead, pass an `Iterator`'): + model.fit(dataset, + epochs=1, steps_per_epoch=2, verbose=0) + with self.assertRaisesRegexp(ValueError, + 'you should not specify a target'): + model.fit(iterator, iterator, + epochs=1, steps_per_epoch=2, verbose=0) + + def test_iterators_running_out_of_data(self): + with self.test_session(): + x = keras.layers.Input(shape=(3,), name='input') + y = keras.layers.Dense(4, name='dense')(x) + model = keras.Model(x, y) + + optimizer = 'rmsprop' + loss = 'mse' + metrics = ['mae'] + model.compile(optimizer, loss, metrics=metrics) + + inputs = np.zeros((10, 3)) + targets = np.zeros((10, 4)) + dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(2) + dataset = dataset.batch(10) + iterator = dataset.make_one_shot_iterator() + + with test.mock.patch.object(logging, 'warning') as mock_log: + model.fit(iterator, epochs=1, steps_per_epoch=3, verbose=0) + self.assertRegexpMatches( + str(mock_log.call_args), + 'dataset iterator ran out of data') + + +if __name__ == '__main__': test.main() diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt index 7713d78b8a5..cdf2da712f3 100644 --- a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt @@ -251,7 +251,7 @@ tf_class { } member_method { name: "test_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "to_json" @@ -263,6 +263,6 @@ tf_class { } member_method { name: "train_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + argspec: "args=[\'self\', \'x\', \'y\', 
\'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " } } diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt index 69b81f75fa0..5c2c29e60fe 100644 --- a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt @@ -268,7 +268,7 @@ tf_class { } member_method { name: "test_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "to_json" @@ -280,6 +280,6 @@ tf_class { } member_method { name: "train_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " } } diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt index 3ac285681f5..b3f3f169227 100644 --- a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt @@ -251,7 +251,7 @@ tf_class { } member_method { name: "test_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "to_json" @@ -263,6 +263,6 @@ tf_class { } member_method { name: "train_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " } } diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt index 51ba0c5043f..4ac6811bace 100644 --- a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt @@ -268,7 +268,7 @@ tf_class { } member_method { name: "test_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "to_json" @@ -280,6 +280,6 @@ tf_class { } member_method { name: "train_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " } } From a186c4c093fce7e3fcc8cd59ca0e968324311f09 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 19 Apr 2018 12:32:52 -0700 Subject: [PATCH 0444/1734] Fix bug in ring_reducer.cc abort handling. 
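The core of the fix is an ordering change in the send/recv completion callbacks: the RingField must be enqueued on the ready queue before the abort is started, so the main loop can still count off every outstanding field and terminate. A minimal Python sketch of the corrected pattern (illustrative only; `ready_queue`, `state`, `rf` and `start_abort` are stand-ins for the real classes in ring_reducer.cc):

def make_done_callback(ready_queue, state, rf, start_abort):
  # Sketch of the corrected callback: enqueue first, then abort.
  def done(status):  # status is None on success, else an error object
    bad_status = status is not None
    if bad_status:
      state['aborted'] = True
    ready_queue.put(rf)     # always counted off by the main loop
    if bad_status:
      start_abort(status)   # abort only after the field is requeued
  return done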
PiperOrigin-RevId: 193557334 --- .../core/common_runtime/ring_reducer.cc | 20 ++++++++++--------- .../core/common_runtime/ring_reducer_test.cc | 12 +++++------ 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/tensorflow/core/common_runtime/ring_reducer.cc b/tensorflow/core/common_runtime/ring_reducer.cc index 79d03a24ced..a1cd7625051 100644 --- a/tensorflow/core/common_runtime/ring_reducer.cc +++ b/tensorflow/core/common_runtime/ring_reducer.cc @@ -426,17 +426,20 @@ bool RingReducer::RunAsyncParts() { // is done. bool dispatched = false; // true if async action was initiated do { - if (aborted) break; + if (aborted) { + // Requeue this RingField to be counted off below. + ready_queue.Enqueue(rf); + break; + } switch (rf->action) { case RF_INIT: if (rf->do_recv) { rf->action = RF_RECV; auto requeue = [this, rf, &ready_queue, &aborted](Status s) { - if (!s.ok()) { - aborted = true; - StartAbort(s); - } + const bool bad_status = !s.ok(); + if (bad_status) aborted = true; ready_queue.Enqueue(rf); + if (bad_status) StartAbort(s); }; DispatchRecv(rf, requeue); dispatched = true; @@ -481,11 +484,10 @@ bool RingReducer::RunAsyncParts() { if (rf->do_send) { rf->action = RF_SEND; auto send_complete = [this, rf, &ready_queue, &aborted](Status s) { - if (!s.ok()) { - aborted = true; - StartAbort(s); - } + const bool bad_status = !s.ok(); + if (bad_status) aborted = true; ready_queue.Enqueue(rf); + if (bad_status) StartAbort(s); }; DispatchSend(rf, send_complete); dispatched = true; diff --git a/tensorflow/core/common_runtime/ring_reducer_test.cc b/tensorflow/core/common_runtime/ring_reducer_test.cc index 57c36d6582c..e4387a074af 100644 --- a/tensorflow/core/common_runtime/ring_reducer_test.cc +++ b/tensorflow/core/common_runtime/ring_reducer_test.cc @@ -572,9 +572,9 @@ DEF_TEST(INT32, CPU, 2, 8, 3, 4095, 0) DEF_TEST(INT64, CPU, 1, 2, 1, 1001, 0) DEF_TEST(INT64, CPU, 2, 8, 3, 4095, 0) -// // Failure tests -// DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 7) -// DEF_TEST(FLOAT, CPU, 2, 8, 2, 9408, 11) +// Failure tests +DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 7) +DEF_TEST(FLOAT, CPU, 2, 8, 2, 9408, 11) #endif #ifdef GOOGLE_CUDA @@ -597,9 +597,9 @@ DEF_TEST(DOUBLE, GPU, 1, 2, 1, 1001, 0) // DEF_TEST(INT32, GPU, 1, 2, 1, 1001, 0) DEF_TEST(INT64, GPU, 1, 2, 1, 1001, 0) -// // Failure tests -// DEF_TEST(FLOAT, GPU, 1, 8, 1, 9408, 2) -// DEF_TEST(FLOAT, GPU, 1, 8, 2, 9408, 5) +// Failure tests +DEF_TEST(FLOAT, GPU, 1, 8, 1, 9408, 2) +DEF_TEST(FLOAT, GPU, 1, 8, 2, 9408, 5) #endif } // namespace From 46aec0d27f5d6fb3a0b81bc5a3384da11273dad6 Mon Sep 17 00:00:00 2001 From: Sung Jin Hwang Date: Thu, 19 Apr 2018 12:44:21 -0700 Subject: [PATCH 0445/1734] Make PmfToQuantizedCdf op make adjustments if the sum of the quantized pmf is less than 2**precision. Prior to this change, the op did nothing when the sum of the quantized pmf was less than 2**precision. While the produced CDF was valid for range coders, adjustments to the CDF could be made to achieve a better compression rate.
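A rough NumPy sketch of the new adjustment (illustrative only; `adjust_quantized_pmf` is a made-up name, and the selection rule here is a crude proxy for the gain/penalty heuristics in the kernel):

import numpy as np

def adjust_quantized_pmf(quantized, precision):
  # Nudge the quantized pmf so it sums to exactly 2**precision.
  target = 2 ** precision
  q = np.asarray(quantized, dtype=np.int64).copy()
  while q.sum() < target:
    # The kernel increments the entry with the largest code-length gain;
    # incrementing the currently-largest entry is a simple stand-in.
    q[np.argmax(q)] += 1
  while q.sum() > target:
    # Never push an entry below 1, so every symbol stays encodable.
    candidates = np.flatnonzero(q > 1)
    q[candidates[np.argmax(q[candidates])]] -= 1
  return q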
PiperOrigin-RevId: 193558740 --- .../contrib/coder/kernels/pmf_to_cdf_op.cc | 60 ++++++++++++++--- .../coder/kernels/pmf_to_cdf_op_test.cc | 6 +- tensorflow/contrib/coder/ops/coder_ops.cc | 16 +++-- 3 files changed, 64 insertions(+), 18 deletions(-) diff --git a/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc index c787e8edede..bd5272ee6f2 100644 --- a/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc +++ b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc @@ -16,6 +16,7 @@ limitations under the License. #define EIGEN_USE_THREADS #include <algorithm> +#include <cmath> #include <functional> #include <numeric> #include <vector> @@ -79,8 +80,8 @@ class PmfToCdfOp : public OpKernel { } private: - struct Item { - Item(int32* p, double mass) : pointer(p), mass(mass) { + struct PenaltyItem { + PenaltyItem(int32* p, double mass) : pointer(p), mass(mass) { penalty = ComputeNextPenalty(); } @@ -90,7 +91,7 @@ class PmfToCdfOp : public OpKernel { penalty = ComputeNextPenalty(); } - friend bool operator<(const Item& lhs, const Item& rhs) { + friend bool operator<(const PenaltyItem& lhs, const PenaltyItem& rhs) { return lhs.penalty < rhs.penalty; } @@ -106,6 +107,34 @@ class PmfToCdfOp : public OpKernel { double penalty; }; + struct GainItem { + GainItem(int32* p, double mass) : pointer(p), mass(mass) { + gain = ComputeNextGain(); + } + + void Increase() { + CHECK_GT(*pointer, 0); + ++*pointer; + gain = ComputeNextGain(); + } + + friend bool operator>(const GainItem& lhs, const GainItem& rhs) { + return lhs.gain > rhs.gain; + } + + double ComputeNextGain() { + // Never increment zero value to non-zero value. + if (*pointer < 1) { + return -std::numeric_limits<double>::infinity(); + } + return mass * (std::log2(*pointer + 1) - std::log2(*pointer)); + } + + int32* pointer; + double mass; + double gain; + }; + void PerShard(gtl::ArraySlice<float> pmf, gtl::MutableArraySlice<int32> cdf) const { CHECK_EQ(pmf.size(), cdf.size()); @@ -121,7 +150,7 @@ class PmfToCdfOp : public OpKernel { int32 sum = std::accumulate(cdf.begin(), cdf.end(), 0); if (sum > normalizer) { - std::vector<Item> queue; + std::vector<PenaltyItem> queue; queue.reserve(cdf.size()); for (int i = 0; i < cdf.size(); ++i) { queue.emplace_back(&cdf[i], pmf[i]); } @@ -132,9 +161,26 @@ class PmfToCdfOp : public OpKernel { queue[0].Decrease(); // Performs a linear search because this find_if is likely to return // iterator very close to the begin. - auto iter = - std::find_if(std::next(queue.begin()), queue.end(), - [&queue](const Item& rhs) { return queue[0] < rhs; }); + auto iter = std::find_if( + std::next(queue.begin()), queue.end(), + [&queue](const PenaltyItem& rhs) { return queue[0] < rhs; }); + std::rotate(queue.begin(), std::next(queue.begin()), iter); + } + } else if (sum < normalizer) { + std::vector<GainItem> queue; + queue.reserve(cdf.size()); + for (int i = 0; i < cdf.size(); ++i) { + queue.emplace_back(&cdf[i], pmf[i]); + } + + std::sort(queue.begin(), queue.end(), std::greater<GainItem>()); + while (sum++ < normalizer) { + queue[0].Increase(); + // Performs a linear search because this find_if is likely to return + // iterator very close to the begin.
+ auto iter = std::find_if( + std::next(queue.begin()), queue.end(), + [&queue](const GainItem& rhs) { return queue[0] > rhs; }); std::rotate(queue.begin(), std::next(queue.begin()), iter); } } diff --git a/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc index c70e38faab7..3408f6b519a 100644 --- a/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc +++ b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc @@ -82,7 +82,7 @@ class PmfToQuantizedCdfOpTest : public OpsTestBase { EXPECT_GT(diff, 0); } - EXPECT_LE(cdf_slice(cdf_slice.size() - 1), normalizer); + EXPECT_EQ(cdf_slice(cdf_slice.size() - 1), normalizer); } } }; @@ -98,6 +98,8 @@ TEST_F(PmfToQuantizedCdfOpTest, UnderSum) { GenerateData(&rand, {&matrix(i, 0), n}); } + pmf.flat<float>() = pmf.flat<float>() * 0.85f; + constexpr int kPrecision = 10; SetupOp(kPrecision, &pmf); TF_ASSERT_OK(RunOpKernel()); @@ -115,7 +117,7 @@ TEST_F(PmfToQuantizedCdfOpTest, OverSum) { matrix.setZero(); const std::size_t n = matrix.dimension(1) / 2; - random::PhiloxRandom gen; + random::PhiloxRandom gen(random::New64(), random::New64()); random::SimplePhilox rand(&gen); for (int64 i = 0; i < matrix.dimension(0); ++i) { GenerateData(&rand, {&matrix(i, 0), n}); diff --git a/tensorflow/contrib/coder/ops/coder_ops.cc b/tensorflow/contrib/coder/ops/coder_ops.cc index 9bb171298f8..a185e07913f 100644 --- a/tensorflow/contrib/coder/ops/coder_ops.cc +++ b/tensorflow/contrib/coder/ops/coder_ops.cc @@ -77,7 +77,7 @@ are incorrect. For this reason, the range coder uses integer arithmetics and avoids using any floating point operations internally, and `cdf` should contain integers representing quantized probability mass rather than floating points. -data: An int32 tensor. +data: An int16 tensor. cdf: An int32 tensor representing the CDF's of `data`. Each integer is divided by `2^precision` to represent a fraction. encoded: A range-coded scalar string. @@ -112,7 +112,7 @@ potential performance issues, the decoder does not return error status. encoded: A scalar string tensor from RangeEncode. shape: An int32 1-D tensor representing the shape of the data encoded by RangeEncode. -decoded: An int32 tensor with shape equal to `shape`. +decoded: An int16 tensor with shape equal to `shape`. precision: The number of bits for probability quantization. Must be <= 16, and must match the precision used by RangeEncode that produced `encoded`. )doc"); @@ -138,14 +138,12 @@ platforms. For entropy encoders and decoders to have the same quantized CDF on different platforms, the quantized CDF should be produced once and saved, then the saved quantized CDF should be used everywhere. -After quantization, if PMF sums to less than or equal to 2^precision, then this -is equivalent to cumsum over the last dimension. This op makes no effort to make -the sum close to 2^precision when the sum is already <= 2^precision. -After quantization, if PMF sums to greater than 2^precision, then some values of -PMF is decreased to keep the sum no more than 2^precision. -Note that the input PMF is pre-quantization. +After quantization, if PMF does not sum to 2^precision, then some values of PMF +are increased or decreased to adjust the sum to equal 2^precision. +Note that the input PMF is pre-quantization. The input PMF is not normalized +by this op prior to quantization. Therefore the user is responsible for +normalizing PMF if necessary.
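+ +For example, with precision = 2 the target sum is 2^2 = 4: a quantized PMF of +[2, 1] (sum 3) has one of its nonzero values incremented (e.g. to [3, 1]), and +a quantized PMF of [3, 2] (sum 5) has one value decremented (e.g. to [2, 2]).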
)doc"); // clang-format on } // namespace tensorflow From b3118b1f741896585d47184018f1d74d70e0e6c7 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Thu, 19 Apr 2018 13:08:37 -0700 Subject: [PATCH 0446/1734] Update adam.py --- tensorflow/contrib/optimizer_v2/adam.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py index 76a867039af..d538ad0fb02 100644 --- a/tensorflow/contrib/optimizer_v2/adam.py +++ b/tensorflow/contrib/optimizer_v2/adam.py @@ -40,19 +40,19 @@ class AdamOptimizer(optimizer_v2.OptimizerV2): Initialization: - $$m_0 \Leftarrow 0 (Initialize initial 1st moment vector)$$ - $$v_0 \Leftarrow 0 (Initialize initial 2nd moment vector)$$ - $$t \Leftarrow 0 (Initialize timestep)$$ + $$m_0 := 0 (Initialize initial 1st moment vector)$$ + $$v_0 := 0 (Initialize initial 2nd moment vector)$$ + $$t := 0 (Initialize timestep)$$ The update rule for `variable` with gradient `g` uses an optimization described at the end of section2 of the paper: - $$t \Leftarrow t + 1$$ - $$lr_t \Leftarrow \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$ + $$t := t + 1$$ + $$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$ - $$m_t \Leftarrow beta_1 * m_{t-1} + (1 - beta_1) * g$$ - $$v_t \Leftarrow beta_2 * v_{t-1} + (1 - beta_2) * g * g$$ - $$variable \Leftarrow variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$ + $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$ + $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$ + $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$ The default value of 1e-8 for epsilon might not be a good default in general. For example, when training an Inception network on ImageNet a From 58f6760373b7a2d71053bd17b8017e57e5d1195d Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Thu, 19 Apr 2018 13:09:24 -0700 Subject: [PATCH 0447/1734] Update api_def_ApplyAdam.pbtxt --- tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt index fca8ba25306..b90f5473c89 100644 --- a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt @@ -82,9 +82,9 @@ END } summary: "Update \'*var\' according to the Adam algorithm." description: < Date: Thu, 19 Apr 2018 13:09:59 -0700 Subject: [PATCH 0448/1734] Update api_def_ResourceApplyAdam.pbtxt --- .../core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt index 8b16d824bf9..743247bb60c 100644 --- a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt @@ -76,8 +76,8 @@ END } summary: "Update \'*var\' according to the Adam algorithm." 
description: <<END -$$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$ -$$m_t \Leftarrow beta_1 * m_{t-1} + (1 - beta_1) * g$$ -$$v_t \Leftarrow beta_2 * v_{t-1} + (1 - beta_2) * g * g$$ -$$variable \Leftarrow variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$ +$$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$ +$$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$ +$$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$ +$$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$ END From: Mark Daoust Date: Thu, 19 Apr 2018 13:11:04 -0700 Subject: [PATCH 0449/1734] Update adam.py --- tensorflow/python/training/adam.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py index 9f523a3aca2..6fa3ff66583 100644 --- a/tensorflow/python/training/adam.py +++ b/tensorflow/python/training/adam.py @@ -43,19 +43,19 @@ class AdamOptimizer(optimizer.Optimizer): Initialization: - $$m_0 \Leftarrow 0 (Initialize initial 1st moment vector)$$ - $$v_0 \Leftarrow 0 (Initialize initial 2nd moment vector)$$ - $$t \Leftarrow 0 (Initialize timestep)$$ + $$m_0 := 0 (Initialize initial 1st moment vector)$$ + $$v_0 := 0 (Initialize initial 2nd moment vector)$$ + $$t := 0 (Initialize timestep)$$ The update rule for `variable` with gradient `g` uses an optimization described at the end of section2 of the paper: - $$t \Leftarrow t + 1$$ - $$lr_t \Leftarrow \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$ + $$t := t + 1$$ + $$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$ - $$m_t \Leftarrow beta_1 * m_{t-1} + (1 - beta_1) * g$$ - $$v_t \Leftarrow beta_2 * v_{t-1} + (1 - beta_2) * g * g$$ - $$variable \Leftarrow variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$ + $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$ + $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$ + $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$ The default value of 1e-8 for epsilon might not be a good default in general. For example, when training an Inception network on ImageNet a From b6686d2808b40ed985db2151bcf31961b53e49f5 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 19 Apr 2018 13:09:07 -0700 Subject: [PATCH 0450/1734] Collective Ops Part 4 Add Broadcaster. A few minor adjustments to CollectiveParams and RMA. This change is part of a series of changes introducing infrastructure for collective ops and initial implementations of reduction and broadcast.
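The binary-tree rank arithmetic at the core of this change can be sketched in Python as follows (a minimal rendering of TreeRecvFrom/TreeSendTo, ignoring subdivisions and device locality; the send-side pruning is paraphrased from the comments in broadcaster.cc):

def tree_recv_from(my_rank, source_rank, is_source):
  # Rank this device receives from; the source receives from no one
  # (the C++ version returns -1 in that case).
  if is_source:
    return None
  if source_rank == 0:
    return (my_rank - 1) // 2
  predecessor = my_rank // 2 - 1
  return source_rank if predecessor < 0 else predecessor

def tree_send_to(my_rank, source_rank, is_source, group_size):
  # Ranks this device sends to in the binary tree.
  successor = 2 * my_rank + 1 if source_rank == 0 else 2 * (my_rank + 1)
  targets = [r for r in (successor, successor + 1)
             if r < group_size and r != source_rank]
  if is_source and source_rank != 0:
    # A non-rank-0 source also sends to ranks 0 and 1 directly.
    targets = [r for r in (0, 1) if r < group_size] + targets
  return targets

For example, with group_size = 4 and the source at rank 0, rank 0 sends to {1, 2}, rank 1 sends to {3}, and each non-source rank r receives from (r - 1) // 2.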
PiperOrigin-RevId: 193562391 --- tensorflow/core/BUILD | 30 + .../base_collective_executor.cc | 81 +- .../common_runtime/base_collective_executor.h | 7 + tensorflow/core/common_runtime/broadcaster.cc | 249 ++++++ tensorflow/core/common_runtime/broadcaster.h | 66 ++ .../core/common_runtime/broadcaster_test.cc | 741 ++++++++++++++++++ .../collective_param_resolver_local.cc | 42 +- .../collective_param_resolver_local_test.cc | 8 +- .../common_runtime/collective_rma_local.h | 2 + tensorflow/core/framework/collective.cc | 15 +- tensorflow/core/framework/collective.h | 7 +- 11 files changed, 1220 insertions(+), 28 deletions(-) create mode 100644 tensorflow/core/common_runtime/broadcaster.cc create mode 100644 tensorflow/core/common_runtime/broadcaster.h create mode 100644 tensorflow/core/common_runtime/broadcaster_test.cc diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 54e7ab31d75..c15e7de186f 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -2256,6 +2256,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [ "common_runtime/allocator_retry.h", "common_runtime/base_collective_executor.h", "common_runtime/bfc_allocator.h", + "common_runtime/broadcaster.h", "common_runtime/buf_rendezvous.h", "common_runtime/build_graph_options.h", "common_runtime/collective_executor_mgr.h", @@ -2303,6 +2304,7 @@ tf_cuda_library( "common_runtime/allocator_retry.cc", "common_runtime/base_collective_executor.cc", "common_runtime/bfc_allocator.cc", + "common_runtime/broadcaster.cc", "common_runtime/buf_rendezvous.cc", "common_runtime/build_graph_options.cc", "common_runtime/collective_executor_mgr.cc", @@ -3140,6 +3142,34 @@ tf_cc_tests_gpu( ], ) +tf_cc_tests_gpu( + name = "broadcaster_test", + size = "small", + srcs = [ + "common_runtime/broadcaster_test.cc", + ], + linkstatic = tf_kernel_tests_linkstatic(), + tags = tf_cuda_tests_tags(), + deps = [ + ":all_kernels", + ":core", + ":core_cpu", + ":core_cpu_internal", + ":direct_session_internal", + ":framework", + ":framework_internal", + ":gpu_runtime", + ":lib", + ":lib_internal", + ":ops", + ":protos_all_cc", + ":protos_test_cc", + ":test", + ":test_main", + ":testlib", + ], +) + tf_cc_test_mkl( name = "mkl_runtime_tests", size = "small", diff --git a/tensorflow/core/common_runtime/base_collective_executor.cc b/tensorflow/core/common_runtime/base_collective_executor.cc index f6332fabdb3..637b43c844b 100644 --- a/tensorflow/core/common_runtime/base_collective_executor.cc +++ b/tensorflow/core/common_runtime/base_collective_executor.cc @@ -14,14 +14,13 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/core/common_runtime/base_collective_executor.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/common_runtime/broadcaster.h" #include "tensorflow/core/common_runtime/copy_tensor.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/dma_helper.h" #include "tensorflow/core/common_runtime/process_util.h" #include "tensorflow/core/common_runtime/ring_reducer.h" #include "tensorflow/core/lib/core/notification.h" -#include "tensorflow/core/lib/strings/str_util.h" #define VALUE_IN_DEBUG_STRING false @@ -194,37 +193,68 @@ void BaseCollectiveExecutor::ExecuteAsync(OpKernelContext* ctx, const CollectiveParams& col_params, const string& exec_key, StatusCallback done) { - const Tensor* input = &ctx->input(0); + // On any individual collective Op failure we need to abort the + // BufRendezvous so that other Ops in the instance don't hang + // waiting for transmissions that will never happen. Do so after a + // delay so that the original error status is more likely to + // propagate up, and peers are unlikely to re-create the purged + // BufRendezvous by late-arriving requests. + StatusCallback done_safe = [this, done](const Status& s) { + if (!s.ok()) { + Ref(); // Ensure this lasts until the closure executes. + SchedNonBlockingClosureAfter(1000000, [this, s] { + remote_access_->buf_rendezvous()->StartAbort(s); + Unref(); + }); + } + done(s); + }; + Tensor* output = ctx->mutable_output(0); string error; switch (col_params.instance.type) { case REDUCTION_COLLECTIVE: { // TODO(tucker): support other reduction algorithms, // e.g. tree-reduce, hybrid tree/ring, delegate-to-NCCL, etc. + const Tensor* input = &ctx->input(0); RingReducer* reducer = CreateReducer(ctx, CtxParams(ctx), col_params, exec_key, step_id_, input, output, &error); if (!reducer) { - done(errors::Internal(error)); + done_safe(errors::Internal(error)); return; } // Run in an I/O thread, so as not to starve the executor threads. // TODO(tucker): Instead of forking every per-device Collective // Op off into its own thread, consider queuing them on a // fixed-size thread-pool dedicated to running CollectiveOps. - SchedClosure([reducer, done]() { - reducer->Run([reducer, done](const Status& s) { - done(s); + SchedClosure([reducer, done_safe]() { + reducer->Run([reducer, done_safe](const Status& s) { + done_safe(s); delete reducer; }); }); } break; - case BROADCAST_COLLECTIVE: - done(errors::Internal("Collective Broadcast unimplemented")); - break; + + case BROADCAST_COLLECTIVE: { + Broadcaster* broadcaster = CreateBroadcaster( + ctx, CtxParams(ctx), col_params, exec_key, step_id_, output, &error); + if (!broadcaster) { + done_safe(errors::Internal(error)); + return; + } + // Run in an I/O thread, so as not to starve the executor threads. 
+ SchedClosure([broadcaster, done_safe]() { + broadcaster->Run([broadcaster, done_safe](const Status& s) { + done_safe(s); + delete broadcaster; + }); + }); + } break; + default: - done(errors::Internal("Unimplemented CollectiveType ", - col_params.instance.type)); + done_safe(errors::Internal("Unimplemented CollectiveType ", + col_params.instance.type)); } } @@ -254,4 +284,31 @@ RingReducer* BaseCollectiveExecutor::CreateReducer( } } +Broadcaster* BaseCollectiveExecutor::CreateBroadcaster( + OpKernelContext* ctx, OpKernelContext::Params* params, + const CollectiveParams& col_params, const string& exec_key, int64 step_id, + Tensor* output, string* error) { + switch (col_params.instance.data_type) { + case DT_INT32: + if (col_params.group.device_type == DEVICE_GPU) { + *error = + "Collective Broadcast does not support datatype DT_INT32 on " + "DEVICE_GPU"; + return nullptr; + } + TF_FALLTHROUGH_INTENDED; + case DT_FLOAT: + case DT_DOUBLE: + case DT_INT64: { + return new Broadcaster(this, dev_mgr_, ctx, params, col_params, exec_key, + step_id, output); + } break; + default: + *error = + strings::StrCat("Collective Broadcast does not support datatype ", + DataTypeString(col_params.instance.data_type)); + return nullptr; + } +} + } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/base_collective_executor.h b/tensorflow/core/common_runtime/base_collective_executor.h index 58eaf31f710..462d6b75331 100644 --- a/tensorflow/core/common_runtime/base_collective_executor.h +++ b/tensorflow/core/common_runtime/base_collective_executor.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/core/framework/device_attributes.pb.h" namespace tensorflow { +class Broadcaster; class DeviceMgr; class RingReducer; @@ -138,6 +139,12 @@ class BaseCollectiveExecutor : public CollectiveExecutor { const string& exec_key, int64 step_id, const Tensor* input, Tensor* output, string* error); + + Broadcaster* CreateBroadcaster(OpKernelContext* ctx, + OpKernelContext::Params* params, + const CollectiveParams& col_params, + const string& exec_key, int64 step_id, + Tensor* output, string* error); }; } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/broadcaster.cc b/tensorflow/core/common_runtime/broadcaster.cc new file mode 100644 index 00000000000..5e8af8653dc --- /dev/null +++ b/tensorflow/core/common_runtime/broadcaster.cc @@ -0,0 +1,249 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/core/common_runtime/broadcaster.h" + +#include "tensorflow/core/common_runtime/collective_rma_local.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/dma_helper.h" +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/platform/env.h" + +// Set true for greater intelligibility of debug mode log messages. 
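+// For example, a (hypothetical) exec_key of "17:0:0" with src_rank 0 and
+// dst_rank 1 yields "broadcast(17:0:0):src(0):dst(1)" when true, and the
+// denser "17:0:0:0:1" when false.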
+#define READABLE_KEYS false
+
+namespace tensorflow {
+
+namespace {
+// Key to be used for BufRendezvous by Broadcaster.
+string BroadcastBufKey(const string& exec_key, int src_rank, int dst_rank) {
+  if (READABLE_KEYS) {
+    return strings::StrCat("broadcast(", exec_key, "):src(", src_rank, "):dst(",
+                           dst_rank, ")");
+  } else {
+    // TODO(tucker): Try a denser format, e.g. a 64 or 128 bit hash.
+    return strings::StrCat(exec_key, ":", src_rank, ":", dst_rank);
+  }
+}
+}  // namespace
+
+Broadcaster::Broadcaster(CollectiveExecutor* col_exec, const DeviceMgr* dev_mgr,
+                         OpKernelContext* ctx, OpKernelContext::Params* params,
+                         const CollectiveParams& col_params,
+                         const string& exec_key, int64 step_id, Tensor* output)
+    : col_exec_(col_exec),
+      dev_mgr_(dev_mgr),
+      ctx_(ctx),
+      col_params_(col_params),
+      exec_key_(exec_key),
+      rank_(col_params.subdiv_rank[0]),
+      is_source_(col_params.is_source),
+      output_(output),
+      done_(nullptr),
+      device_(nullptr) {}
+
+void Broadcaster::Run(StatusCallback done) {
+  // The optimal data transfer choreography is going to be very platform
+  // dependent. That will be addressed by later improvements here or by
+  // platform-specific overrides of collective broadcast. The initial version
+  // is simply a binary tree that completely ignores DeviceLocality.
+  done_ = std::move(done);
+
+  // Get the device for which we're executing and look up its locality.
+  status_ = dev_mgr_->LookupDevice(
+      col_params_.instance.device_names[col_params_.default_rank], &device_);
+  if (!status_.ok()) {
+    done_(status_);
+    return;
+  }
+  CHECK(device_);
+  device_locality_ = device_->attributes().locality();
+
+  RunTree();
+}
+
+// Binary tree parent/child relations are trivial to calculate, i.e.
+// device at rank r is the parent of 2r+1 and 2r+2. The one exception
+// is if the source is not rank 0. We treat that case as though the
+// source is appended to the front of the rank ordering as well as
+// continuing to occupy its current position. Hence we calculate as
+// though each device's rank is actually r+1, then subtract 1 again to
+// get the descendant ranks. If the source is not rank 0 then its
+// descendants include both {0,1} and the descendants of its current
+// position. Where a non-0-rank source is a descendant of another
+// device, no send to it is necessary.
+
+/* static*/
+int Broadcaster::TreeRecvFrom(const CollectiveParams& cp) {
+  DCHECK_EQ(1, cp.subdiv_rank.size());
+  if (cp.is_source) return -1;
+  int source_rank = cp.instance.impl_details.subdiv_source_rank[0];
+  int my_rank = cp.subdiv_rank[0];
+  if (source_rank == 0) {
+    return (my_rank - 1) / 2;
+  } else {
+    int predecessor_rank = (my_rank / 2) - 1;
+    return (predecessor_rank < 0) ? source_rank : predecessor_rank;
+  }
+}
+
+/* static */
+void Broadcaster::TreeSendTo(const CollectiveParams& cp,
+                             std::vector<int>* targets) {
+  DCHECK_EQ(1, cp.subdiv_rank.size());
+  targets->clear();
+  int my_rank = cp.subdiv_rank[0];
+  DCHECK_EQ(1, cp.instance.impl_details.subdiv_source_rank.size());
+  int source_rank = cp.instance.impl_details.subdiv_source_rank[0];
+  int successor_rank = 0;
+  if (source_rank == 0) {
+    successor_rank = (2 * my_rank) + 1;
+  } else {
+    successor_rank = (2 * (my_rank + 1));
+  }
+  DCHECK_NE(successor_rank, my_rank);
+  if (cp.is_source && source_rank != 0) {
+    // The source sends to rank 0,1 in addition to its positional
+    // descendants.
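+    // (Worked example, per the TreeLinks test table in broadcaster_test.cc
+    // below: with group_size=8 and source_rank=7, the source sends to {0,1};
+    // rank 0 then receives from 7 and forwards to {2,3}.)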
+    if (cp.group.group_size > 1) {
+      targets->push_back(0);
+    }
+    if (cp.group.group_size > 2 && source_rank != 1) {
+      targets->push_back(1);
+    }
+  }
+  for (int i = 0; i < 2; ++i) {
+    if (successor_rank < cp.group.group_size && successor_rank != source_rank) {
+      targets->push_back(successor_rank);
+    }
+    ++successor_rank;
+  }
+}
+
+// Execute a tree broadcast, i.e. each non-source device receives from
+// one other and sends to up to two others.
+void Broadcaster::RunTree() {
+  mutex mu;
+  int pending_count = 0;  // GUARDED_BY(mu)
+  condition_variable all_done;
+  std::vector<int> send_to_ranks;
+  TreeSendTo(col_params_, &send_to_ranks);
+
+  if (!is_source_) {
+    // Begin by receiving the value.
+    int recv_from_rank = TreeRecvFrom(col_params_);
+    Notification note;
+    DispatchRecv(recv_from_rank, output_,
+                 [this, recv_from_rank, &mu, &note](const Status& s) {
+                   mutex_lock l(mu);
+                   status_.Update(s);
+                   note.Notify();
+                 });
+    note.WaitForNotification();
+  }
+
+  // Then forward value to all descendant devices.
+  if (status_.ok()) {
+    for (int i = 0; i < send_to_ranks.size(); ++i) {
+      int target_rank = send_to_ranks[i];
+      {
+        mutex_lock l(mu);
+        ++pending_count;
+      }
+      DispatchSend(
+          target_rank, output_,
+          [this, target_rank, &mu, &pending_count, &all_done](const Status& s) {
+            status_.Update(s);
+            {
+              mutex_lock l(mu);
+              --pending_count;
+              if (pending_count == 0) {
+                all_done.notify_all();
+              }
+            }
+          });
+    }
+  }
+
+  if (status_.ok() && is_source_) {
+    // Meanwhile, copy input to output if we weren't lucky enough to
+    // be able to reuse input as output.
+    const Tensor* input = &ctx_->input(0);
+    if (input != output_ &&
+        (DMAHelper::base(input) != DMAHelper::base(output_))) {
+      {
+        mutex_lock l(mu);
+        ++pending_count;
+      }
+      DeviceContext* op_dev_ctx = ctx_->op_device_context();
+      CollectiveRemoteAccessLocal::MemCpyAsync(
+          op_dev_ctx, op_dev_ctx, device_, device_, ctx_->input_alloc_attr(0),
+          ctx_->output_alloc_attr(0), input, output_,
+          [this, &mu, &pending_count, &all_done](const Status& s) {
+            status_.Update(s);
+            {
+              mutex_lock l(mu);
+              --pending_count;
+              if (0 == pending_count) {
+                all_done.notify_all();
+              }
+            }
+          });
+    }
+  }
+
+  // Then wait for all pending actions to complete.
+  {
+    mutex_lock l(mu);
+    if (pending_count > 0) {
+      all_done.wait(l);
+    }
+  }
+
+  VLOG(2) << "return status " << status_;
+  done_(status_);
+}
+
+void Broadcaster::DispatchSend(int dst_rank, const Tensor* src_tensor,
+                               const StatusCallback& done) {
+  string send_buf_key = BroadcastBufKey(exec_key_, rank_, dst_rank);
+  VLOG(1) << "DispatchSend " << send_buf_key << " from_device "
+          << device_->name();
+  int dst_idx =
+      col_params_.instance.impl_details.subdiv_permutations[0][dst_rank];
+  col_exec_->PostToPeer(col_params_.instance.device_names[dst_idx],
+                        col_params_.instance.task_names[dst_idx], send_buf_key,
+                        device_, ctx_->op_device_context(),
+                        ctx_->output_alloc_attr(0), src_tensor,
+                        device_locality_, done);
+}
+
+void Broadcaster::DispatchRecv(int src_rank, Tensor* dst_tensor,
+                               const StatusCallback& done) {
+  string recv_buf_key = BroadcastBufKey(exec_key_, src_rank, rank_);
+  int src_idx =
+      col_params_.instance.impl_details.subdiv_permutations[0][src_rank];
+  VLOG(1) << "DispatchRecv " << recv_buf_key << " from_device "
+          << col_params_.instance.device_names[src_idx];
+  int dst_idx = col_params_.instance.impl_details.subdiv_permutations[0][rank_];
+  CHECK_EQ(col_params_.instance.device_names[dst_idx], device_->name());
+  col_exec_->RecvFromPeer(col_params_.instance.device_names[src_idx],
+                          col_params_.instance.task_names[src_idx],
+                          col_params_.task.is_local[src_idx], recv_buf_key,
+                          device_, ctx_->op_device_context(),
+                          ctx_->output_alloc_attr(0), dst_tensor,
+                          device_locality_, done);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/broadcaster.h b/tensorflow/core/common_runtime/broadcaster.h
new file mode 100644
index 00000000000..bdf68f19abd
--- /dev/null
+++ b/tensorflow/core/common_runtime/broadcaster.h
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_BROADCASTER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_BROADCASTER_H_
+
+#include <vector>
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
+
+namespace tensorflow {
+
+// Tree-algorithm implementation of collective broadcast.
+class Broadcaster {
+ public:
+  Broadcaster(CollectiveExecutor* col_exec, const DeviceMgr* dev_mgr,
+              OpKernelContext* ctx, OpKernelContext::Params* params,
+              const CollectiveParams& col_params, const string& exec_key,
+              int64 step_id, Tensor* output);
+
+  void Run(StatusCallback done);
+
+  // Returns the rank of the device from which this device should receive
+  // its value, -1 if no value should be received.
+  static int TreeRecvFrom(const CollectiveParams& cp);
+
+  // Populates targets with the ranks of the devices to which this device
+  // should forward the value.
+  static void TreeSendTo(const CollectiveParams& cp, std::vector<int>* targets);
+
+ private:
+  void DispatchSend(int dst_rank, const Tensor* src_tensor,
+                    const StatusCallback& done);
+  void DispatchRecv(int src_rank, Tensor* dst_tensor,
+                    const StatusCallback& done);
+  void RunTree();
+
+  Status status_;
+  CollectiveExecutor* col_exec_;  // Not owned
+  const DeviceMgr* dev_mgr_;      // Not owned
+  OpKernelContext* ctx_;          // Not owned
+  const CollectiveParams& col_params_;
+  const string exec_key_;
+  const int rank_;
+  const bool is_source_;
+  Tensor* output_;  // Not owned
+  std::unique_ptr<CollectiveAdapter> ca_;
+  StatusCallback done_;
+  Device* device_;  // The device for which this instance labors
+  DeviceLocality device_locality_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_BROADCASTER_H_
diff --git a/tensorflow/core/common_runtime/broadcaster_test.cc b/tensorflow/core/common_runtime/broadcaster_test.cc
new file mode 100644
index 00000000000..89d39144b3d
--- /dev/null
+++ b/tensorflow/core/common_runtime/broadcaster_test.cc
@@ -0,0 +1,741 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/broadcaster.h"
+
+#include <algorithm>
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/common_runtime/test_collective_executor_mgr.h"
+#include "tensorflow/core/common_runtime/threadpool_device.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+namespace {
+
+static int64 kStepId = 123;
+static int32 kNumSubdivs = 1;  // Subdiv not yet meaningful for broadcast
+
+// The test harness won't allow a mixture of fixture and non-fixture
+// tests in one file, so this is a trivial fixture for tests that don't
+// need the heavy-weight BroadcasterTest fixture.
+class TrivialTest : public ::testing::Test {
+ protected:
+  TrivialTest() {}
+};
+
+// Tests of static TreeSendTo() and TreeRecvFrom() functions.
+// D = number of devices
+// S = source rank
+// R = tested rank
+// RF = receive-from rank
+// ST = send_to rank vector
+#define DEF_TL_TEST(D, S, R, RF, ST)                                 \
+  TEST_F(TrivialTest, TreeLinks_##D##Devs_##S##Source_##R##Rank) {   \
+    CollectiveParams cp;                                             \
+    cp.group.group_size = D;                                         \
+    cp.instance.impl_details.subdiv_source_rank = {S};               \
+    cp.subdiv_rank = {R};                                            \
+    cp.is_source = (S == R);                                         \
+    EXPECT_EQ(RF, Broadcaster::TreeRecvFrom(cp));                    \
+    std::vector<int> expected = ST;                                  \
+    std::vector<int> send_to;                                        \
+    Broadcaster::TreeSendTo(cp, &send_to);                           \
+    ASSERT_EQ(expected.size(), send_to.size());                      \
+    for (int i = 0; i < expected.size(); ++i) {                      \
+      EXPECT_EQ(expected[i], send_to[i]);                            \
+    }                                                                \
+  }
+
+#define V(...) std::vector<int>({__VA_ARGS__})
+
+//          D  S  R  RF  ST
+// 2 device cases
+DEF_TL_TEST(2, 0, 0, -1, V(1))
+DEF_TL_TEST(2, 1, 0, 1, V())
+DEF_TL_TEST(2, 0, 1, 0, V())
+DEF_TL_TEST(2, 1, 1, -1, V(0))
+// 3 device cases
+DEF_TL_TEST(3, 0, 0, -1, V(1, 2))
+DEF_TL_TEST(3, 0, 1, 0, V())
+DEF_TL_TEST(3, 0, 2, 0, V())
+DEF_TL_TEST(3, 1, 0, 1, V(2))
+DEF_TL_TEST(3, 1, 1, -1, V(0))
+DEF_TL_TEST(3, 1, 2, 0, V())
+DEF_TL_TEST(3, 2, 0, 2, V())
+DEF_TL_TEST(3, 2, 1, 2, V())
+DEF_TL_TEST(3, 2, 2, -1, V(0, 1))
+// 4 device cases
+DEF_TL_TEST(4, 0, 0, -1, V(1, 2))
+DEF_TL_TEST(4, 0, 1, 0, V(3))
+DEF_TL_TEST(4, 0, 2, 0, V())
+DEF_TL_TEST(4, 0, 3, 1, V())
+DEF_TL_TEST(4, 1, 0, 1, V(2, 3))
+DEF_TL_TEST(4, 1, 1, -1, V(0))
+DEF_TL_TEST(4, 1, 2, 0, V())
+DEF_TL_TEST(4, 1, 3, 0, V())
+DEF_TL_TEST(4, 2, 0, 2, V(3))
+DEF_TL_TEST(4, 2, 1, 2, V())
+DEF_TL_TEST(4, 2, 2, -1, V(0, 1))
+DEF_TL_TEST(4, 2, 3, 0, V())
+DEF_TL_TEST(4, 3, 0, 3, V(2))
+DEF_TL_TEST(4, 3, 1, 3, V())
+DEF_TL_TEST(4, 3, 2, 0, V())
+DEF_TL_TEST(4, 3, 3, -1, V(0, 1))
+// 8 device cases
+//          D  S  R  RF  ST
+DEF_TL_TEST(8, 0, 0, -1, V(1, 2))
+DEF_TL_TEST(8, 0, 1, 0, V(3, 4))
+DEF_TL_TEST(8, 0, 2, 0, V(5, 6))
+DEF_TL_TEST(8, 0, 3, 1, V(7))
+DEF_TL_TEST(8, 0, 4, 1, V())
+DEF_TL_TEST(8, 0, 5, 2, V())
+DEF_TL_TEST(8, 0, 6, 2, V())
+DEF_TL_TEST(8, 0, 7, 3, V())
+DEF_TL_TEST(8, 7, 0, 7, V(2, 3))
+DEF_TL_TEST(8, 7, 1, 7, V(4, 5))
+DEF_TL_TEST(8, 7, 2, 0, V(6))
+DEF_TL_TEST(8, 7, 3, 0, V())
+DEF_TL_TEST(8, 7, 4, 1, V())
+DEF_TL_TEST(8, 7, 5, 1, V())
+DEF_TL_TEST(8, 7, 6, 2, V())
+DEF_TL_TEST(8, 7, 7, -1, V(0, 1))
+#undef DEF_TL_TEST
+#undef V
+
+// Wraps CollectiveRemoteAccessLocal with the ability to return an
+// error status to the N'th action.
+// TODO(tucker): factor out of this file and ring_reducer_test.cc
+// into a single common source.
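The table above fully determines the tree links, and the rank arithmetic can be reproduced outside the test harness. A minimal standalone sketch, assuming nothing from TensorFlow (RecvFrom/SendTo are hypothetical free functions mirroring the TreeRecvFrom()/TreeSendTo() logic quoted earlier):

// Standalone model of Broadcaster's binary-tree ranks, single subdivision.
#include <cstdio>
#include <vector>

int RecvFrom(int source, int rank) {
  if (rank == source) return -1;
  if (source == 0) return (rank - 1) / 2;
  int predecessor = (rank / 2) - 1;
  return predecessor < 0 ? source : predecessor;
}

std::vector<int> SendTo(int group_size, int source, int rank) {
  std::vector<int> targets;
  if (rank == source && source != 0) {
    // A non-0-rank source also feeds ranks 0 and 1 directly.
    if (group_size > 1) targets.push_back(0);
    if (group_size > 2 && source != 1) targets.push_back(1);
  }
  int successor = (source == 0) ? (2 * rank) + 1 : 2 * (rank + 1);
  for (int i = 0; i < 2; ++i, ++successor) {
    if (successor < group_size && successor != source) {
      targets.push_back(successor);
    }
  }
  return targets;
}

int main() {
  // Reproduces DEF_TL_TEST(8, 7, 1, 7, V(4, 5)) from the table above.
  std::printf("recv_from=%d send_to=", RecvFrom(7, 1));  // recv_from=7
  for (int t : SendTo(8, 7, 1)) std::printf("%d ", t);   // 4 5
  std::printf("\n");
  return 0;
}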
+class FailTestRMA : public CollectiveRemoteAccessLocal {
+ public:
+  FailTestRMA(const DeviceMgr* dev_mgr, DeviceResolverInterface* dev_resolver,
+              int64 step_id, int fail_after)
+      : CollectiveRemoteAccessLocal(dev_mgr, dev_resolver, step_id),
+        fail_after_(fail_after) {}
+
+  bool MaybeFail(const StatusCallback& done) {
+    bool fail_now = false;
+    {
+      mutex_lock l(mu_);
+      if (fail_after_ > 0) {
+        fail_now = (--fail_after_ == 0);
+      }
+    }
+    if (fail_now) {
+      auto error = errors::Internal("Deliberate failure");
+      LOG(INFO) << "triggering failure " << error;
+      SchedNonBlockingClosureAfter(
+          1000, [this, error] { buf_rendezvous()->StartAbort(error); });
+      done(error);
+      return true;
+    }
+    return false;
+  }
+
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,
+                    const StatusCallback& done) override {
+    if (MaybeFail(done)) return;
+    CollectiveRemoteAccessLocal::RecvFromPeer(
+        peer_device, peer_task, peer_is_local, key, to_device, to_device_ctx,
+        to_alloc_attr, to_tensor, client_locality, done);
+  }
+
+  void PostToPeer(const string& peer_device, const string& peer_task,
+                  const string& key, Device* from_device,
+                  DeviceContext* from_device_ctx,
+                  const AllocatorAttributes& from_alloc_attr,
+                  const Tensor* from_tensor,
+                  const DeviceLocality& client_locality,
+                  const StatusCallback& done) override {
+    if (MaybeFail(done)) return;
+    CollectiveRemoteAccessLocal::PostToPeer(
+        peer_device, peer_task, key, from_device, from_device_ctx,
+        from_alloc_attr, from_tensor, client_locality, done);
+  }
+
+  mutex mu_;
+  int fail_after_ GUARDED_BY(mu_);
+};
+
+class BroadcasterTest : public ::testing::Test {
+ protected:
+  BroadcasterTest() : device_type_(DEVICE_CPU) {}
+
+  ~BroadcasterTest() override {
+    stop_ = true;
+    for (auto i : instances_) {
+      delete i;
+    }
+    if (col_exec_) col_exec_->Unref();
+  }
+
+  void SetUp() override {
+#if GOOGLE_CUDA
+    auto device_factory = DeviceFactory::GetFactory("GPU");
+    CHECK(device_factory);
+    SessionOptions options;
+    Status s = device_factory->CreateDevices(
+        options, "/job:worker/replica:0/task:0", &gpu_devices_);
+    CHECK(s.ok());
+#endif
+  }
+
+  void Init(int num_workers, int num_devices, DataType dtype,
+            const DeviceType& device_type, int fail_after) {
+    device_type_ = device_type;
+    std::vector<Device*> local_devices;
+    SessionOptions sess_opts;
+    sess_opts.env = Env::Default();
+    Bytes mem_limit(4 << 20);
+    DeviceLocality dev_locality;
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        if (device_type == DEVICE_CPU) {
+          string dev_name = strings::StrCat("/job:worker/replica:0/task:", wi,
+                                            "/device:CPU:", di);
+          local_devices.push_back(new ThreadPoolDevice(
+              sess_opts, dev_name, mem_limit, dev_locality, cpu_allocator()));
+        } else if (device_type == DEVICE_GPU && !gpu_devices_.empty()) {
+          int dev_idx = (wi * num_devices) + di;
+          if (dev_idx >= static_cast<int>(gpu_devices_.size())) {
+            LOG(INFO) << "dev_mgr has access to limited GPUs, reusing for more "
+                         "than one ring node.";
+          } else {
+            local_devices.push_back(gpu_devices_[dev_idx]);
+          }
+        } else {
+          LOG(FATAL) << "Unsupported device_type " << device_type;
+        }
+      }
+    }
+    if (!dev_mgr_ || device_type == DEVICE_CPU) {
+      dev_mgr_.reset(new DeviceMgr(local_devices));
+    }
+    dev_resolver_.reset(new DeviceResolverLocal(dev_mgr_.get()));
+    rma_ = new FailTestRMA(dev_mgr_.get(), dev_resolver_.get(), kStepId,
+                           fail_after);
+    col_exec_ = new BaseCollectiveExecutor(&col_exec_mgr_, rma_, kStepId,
+                                           dev_mgr_.get());
+    col_params_.name = "test_collective";
+    col_params_.instance.data_type = dtype;
+    static const int kGroupKey = 5;
+    col_params_.group.group_key = kGroupKey;
+    static const int kInstanceKey = 17;
+    col_params_.instance.instance_key = kInstanceKey;
+    col_params_.group.device_type = device_type;
+    col_params_.group.group_size = num_workers * num_devices;
+    col_params_.instance.impl_details.subdiv_offsets.clear();
+    col_params_.instance.type = BROADCAST_COLLECTIVE;
+    col_params_.instance.impl_details.subdiv_permutations.resize(kNumSubdivs);
+    col_params_.subdiv_rank.resize(kNumSubdivs);
+    int subdiv_stride = num_devices / kNumSubdivs;
+    for (int sdi = 0; sdi < kNumSubdivs; ++sdi) {
+      col_params_.instance.impl_details.subdiv_offsets.push_back(sdi *
+                                                                 subdiv_stride);
+      col_params_.subdiv_rank[sdi] = sdi * subdiv_stride;
+    }
+
+    // Set up a local device ring order that's not just 0,1,2...
+    std::vector<int> local_ring_order;
+    for (int di = 0; di < num_devices; ++di) {
+      local_ring_order.push_back(di);
+    }
+    for (int di = 0; di < num_devices; ++di) {
+      bool is_odd = ((di % 2) == 1);
+      int other = (di + (is_odd ? 7 : 3)) % num_devices;
+      if (di == other) continue;
+      iter_swap(local_ring_order.begin() + di,
+                local_ring_order.begin() + other);
+    }
+    broadcast_dev_id_ = local_ring_order[0];
+    string lro_buf;
+    for (auto d : local_ring_order) strings::StrAppend(&lro_buf, d, ", ");
+    VLOG(1) << "local_ring_order " << lro_buf;
+
+    // Set up all of the fake device contexts.
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        string task_name = strings::StrCat("/job:worker/replica:0/task:", wi);
+        string dev_name = strings::StrCat(task_name, "/device:CPU:", di);
+        if (device_type == DEVICE_GPU) {
+          dev_name = strings::StrCat(task_name, "/device:GPU:0");
+        }
+        col_params_.instance.device_names.push_back(dev_name);
+        col_params_.instance.task_names.push_back(task_name);
+        // Normally each device would set is_local to its own perspective but
+        // this test runs in a single process so is_local is always true.
+        col_params_.task.is_local.push_back(true);
+        for (int sdi = 0; sdi < kNumSubdivs; ++sdi) {
+          int rotated_di =
+              (di + col_params_.instance.impl_details.subdiv_offsets[sdi]) %
+              num_devices;
+          col_params_.instance.impl_details.subdiv_permutations[sdi].push_back(
+              wi * num_devices + local_ring_order[rotated_di]);
+        }
+      }
+    }
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        int rank = wi * num_devices + di;
+        instances_.push_back(new DeviceInstance(
+            rank, col_params_.instance.device_names[rank], device_type_, this));
+      }
+    }
+  }
+
+  typedef std::function<void(Tensor*)> InitFunc;
+
+  void Broadcast() {
+    std::atomic<int> done(0);
+    for (auto di : instances_) {
+      SchedClosure([di, &done] {
+        di->DoBroadcast();
+        ++done;
+      });
+    }
+    while (done < instances_.size()) {
+      if (stop_) break;
+      Env::Default()->SleepForMicroseconds(1000);
+    }
+  }
+
+  std::unique_ptr<OpKernel> GetKernel(const NodeDef& node,
+                                      const DeviceType& device_type,
+                                      DeviceBase* device) {
+    Status status;
+    std::unique_ptr<OpKernel> k = CreateOpKernel(
+        device_type, device, device->GetAllocator(AllocatorAttributes()), node,
+        TF_GRAPH_DEF_VERSION, &status);
+    if (!status.ok()) {
+      LOG(FATAL) << status;
+    }
+    return k;
+  }
+
+  std::unique_ptr<OpKernel> GetCollectiveBcastSend(
+      const CollectiveParams& params, Tensor* input,
+      const DeviceType& device_type, DeviceBase* device) {
+    mutex_lock l(mu_);
+    NodeDef node_def;
+    NodeDefBuilder builder(
+        strings::StrCat("collective_bcast_send_", bcast_send_counter_++),
+        "CollectiveBcastSend");
+    TF_CHECK_OK(builder.Attr("T", input->dtype())
+                    .Attr("group_size", params.group.group_size)
+                    .Attr("group_key", params.group.group_key)
+                    .Attr("instance_key", params.instance.instance_key)
+                    .Attr("shape", input->shape())
+                    .Input(FakeInput(params.instance.data_type))
+                    .Finalize(&node_def));
+    return GetKernel(node_def, device_type, device);
+  }
+
+  std::unique_ptr<OpKernel> GetCollectiveBcastRecv(
+      const CollectiveParams& params, const TensorShape& shape,
+      const DeviceType& device_type, DeviceBase* device) {
+    mutex_lock l(mu_);
+    NodeDef node_def;
+    NodeDefBuilder builder(
+        strings::StrCat("collective_bcast_recv_", bcast_recv_counter_++),
+        "CollectiveBcastRecv");
+    TF_CHECK_OK(builder.Attr("T", params.instance.data_type)
+                    .Attr("group_size", params.group.group_size)
+                    .Attr("group_key", params.group.group_key)
+                    .Attr("instance_key", params.instance.instance_key)
+                    .Attr("shape", shape)
+                    .Finalize(&node_def));
+    return GetKernel(node_def, device_type, device);
+  }
+
+  void BuildColParams() {}
+
+  template <typename T>
+  void RunTest(DataType dtype, const DeviceType& device_type, int num_workers,
+               int num_devices, int tensor_len, int fail_after) {
+    Init(num_workers, num_devices, dtype, device_type, fail_after);
+
+    // Initialize each instance tensor with distinct values.
+    for (int di = 0; di < instances_.size(); ++di) {
+      DeviceInstance* instance = instances_[di];
+      instance->InitTensor(
+          dtype, TensorShape({tensor_len}), [di, dtype](Tensor* t) {
+            for (size_t i = 0; i < t->NumElements(); ++i) {
+              // The cast is necessary to prevent clang-tidy from insisting
+              // that a faster non-open source function be substituted.
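+              // (With di = 0, 1, 2, ... the initial tensors hold 1*i, 10*i,
+              // 100*i, ..., so each device's contribution is distinguishable.)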
+              float value = pow(10, static_cast<double>(di)) * i;
+              t->flat<T>()(i) = value;
+            }
+          });
+    }
+
+    // Copy the expected value from the broadcast source tensor
+    std::vector<T> expected(tensor_len, 0.0);
+    const CollectiveParams& cp = instances_[0]->col_params_;
+    int broadcast_dev_id =
+        cp.instance.impl_details.subdiv_permutations
+            [0][cp.instance.impl_details.subdiv_source_rank[0]];
+    const Tensor* t = &instances_[broadcast_dev_id]->tensor_;
+    Tensor cpu_copy(dtype, TensorShape({tensor_len}));
+    if (device_type == DEVICE_GPU) {
+      Notification notification;
+      Device* dev = instances_[broadcast_dev_id]->device_;
+      auto* dev_info = dev->tensorflow_gpu_device_info();
+      CHECK(dev_info);
+      dev_info->default_context->CopyDeviceTensorToCPU(
+          t, "" /*tensor_name*/, dev, &cpu_copy,
+          [this, &notification](Status s) {
+            TF_CHECK_OK(s);
+            notification.Notify();
+          });
+      notification.WaitForNotification();
+      t = &cpu_copy;
+    }
+    for (size_t i = 0; i < t->NumElements(); ++i) {
+      expected[i] = t->flat<T>()(i);
+    }
+
+    Broadcast();
+
+    // At this point all of the ops have terminated.
+    for (int di = 0; di < instances_.size(); ++di) {
+      if (!instances_[di]->status_.ok()) {
+        ASSERT_GT(fail_after, 0);
+        ASSERT_EQ(instances_[di]->status_.error_message(),
+                  "Deliberate failure");
+        mutex_lock l(mu_);
+        ++failure_count_;
+        continue;
+      }
+      Tensor* inst = &instances_[di]->tensor_;
+      Tensor actual(dtype, TensorShape({tensor_len}));
+      if (device_type_ == DEVICE_CPU) {
+        CHECK(actual.CopyFrom(*inst, inst->shape()));
+      } else if (device_type_ == DEVICE_GPU) {
+        Notification notification;
+        Device* dev = instances_[di]->device_;
+        auto* dev_info = dev->tensorflow_gpu_device_info();
+        CHECK(dev_info);
+        dev_info->default_context->CopyDeviceTensorToCPU(
+            inst, "" /*tensor_name*/, dev, &actual,
+            [this, &notification](Status s) {
+              TF_CHECK_OK(s);
+              notification.Notify();
+            });
+        notification.WaitForNotification();
+      }
+      for (int i = 0; i < tensor_len; ++i) {
+        switch (dtype) {
+          case DT_FLOAT:
+            EXPECT_FLOAT_EQ(expected[i], actual.template flat<T>()(i))
+                << "Mismatch at device " << di << " index " << i;
+            break;
+          case DT_DOUBLE:
+            EXPECT_DOUBLE_EQ(expected[i], actual.template flat<T>()(i))
+                << "Mismatch at device " << di << " index " << i;
+            break;
+          case DT_INT32:
+          case DT_INT64:
+            EXPECT_EQ(expected[i], actual.template flat<T>()(i))
+                << "Mismatch at device " << di << " index " << i;
+            break;
+          default:
+            LOG(FATAL) << "unimplemented";
+        }
+      }
+    }
+
+    // Note that the order of operations during broadcast is
+    // non-deterministic and unlike the reduce case some Ops in the
+    // instance may succeed while others fail, even if a transmission
+    // failure occurs early in the operation chain. So, when an abort
+    // is specified we need to verify that at least one Op fails with
+    // the expected status and any Op that succeeds yields the correct
+    // value.
+    if (fail_after > 0) {
+      mutex_lock l(mu_);
+      EXPECT_GT(failure_count_, 0);
+    }
+  }
+
+  class DeviceInstance {
+   public:
+    DeviceInstance(int rank, const string& dev_name,
+                   const DeviceType& device_type, BroadcasterTest* parent)
+        : parent_(parent),
+          dev_name_(dev_name),
+          device_type_(device_type),
+          rank_(rank) {
+      TF_CHECK_OK(parent_->dev_mgr_->LookupDevice(dev_name, &device_));
+      col_params_.name = parent_->col_params_.name;
+      col_params_.instance.data_type = parent_->col_params_.instance.data_type;
+      col_params_.group.group_key = parent_->col_params_.group.group_key;
+      col_params_.instance.instance_key =
+          parent_->col_params_.instance.instance_key;
+      col_params_.group.device_type = parent_->col_params_.group.device_type;
+      col_params_.group.group_size = parent_->col_params_.group.group_size;
+      col_params_.instance.device_names =
+          parent_->col_params_.instance.device_names;
+      col_params_.instance.task_names =
+          parent_->col_params_.instance.task_names;
+      col_params_.task.is_local = parent_->col_params_.task.is_local;
+      col_params_.instance.impl_details.subdiv_permutations =
+          parent_->col_params_.instance.impl_details.subdiv_permutations;
+      col_params_.subdiv_rank = parent_->col_params_.subdiv_rank;
+
+      int group_size = col_params_.group.group_size;
+      CHECK_EQ(group_size, col_params_.instance.device_names.size());
+      // Default rank is order in device_names.
+      col_params_.default_rank = rank;
+      // perm_rank is order in subdiv[0]:
+      int perm_rank = -1;
+      for (int i = 0;
+           i < col_params_.instance.impl_details.subdiv_permutations[0].size();
+           ++i) {
+        if (rank ==
+            col_params_.instance.impl_details.subdiv_permutations[0][i]) {
+          perm_rank = i;
+          break;
+        }
+      }
+      CHECK_GE(perm_rank, 0);
+      col_params_.instance.impl_details.subdiv_source_rank.resize(1, 0);
+      col_params_.is_source =
+          (perm_rank ==
+           col_params_.instance.impl_details.subdiv_source_rank[0]);
+      // Set rank in all subdivs by finding that default_rank.
+      for (int sdi = 0; sdi < kNumSubdivs; ++sdi) {
+        for (int r = 0;
+             r <
+             col_params_.instance.impl_details.subdiv_permutations[sdi].size();
+             ++r) {
+          if (col_params_.default_rank ==
+              col_params_.instance.impl_details.subdiv_permutations[sdi][r]) {
+            col_params_.subdiv_rank[sdi] = r;
+            CHECK_EQ(0, sdi);
+            CHECK_EQ(perm_rank, col_params_.subdiv_rank[sdi]);
+            break;
+          }
+        }
+      }
+      CHECK_EQ(group_size, col_params_.task.is_local.size());
+      CHECK_EQ(group_size, col_params_.instance.task_names.size());
+    }
+
+    void InitTensor(DataType dtype, const TensorShape& shape,
+                    const InitFunc& f) {
+      tensor_ =
+          Tensor(device_->GetAllocator(AllocatorAttributes()), dtype, shape);
+      if (device_type_ == DEVICE_CPU) {
+        f(&tensor_);
+      } else if (device_type_ == DEVICE_GPU) {
+        Tensor cpu_tensor(dtype, shape);
+        f(&cpu_tensor);
+        Notification notification;
+        auto* dev_info = device_->tensorflow_gpu_device_info();
+        CHECK(dev_info);
+        dev_info->default_context->CopyCPUTensorToDevice(
+            &cpu_tensor, device_, &tensor_, [this, &notification](Status s) {
+              TF_CHECK_OK(s);
+              notification.Notify();
+            });
+        notification.WaitForNotification();
+      } else {
+        LOG(FATAL) << "Unsupported device_type " << device_type_;
+      }
+    }
+
+    void DoBroadcast() {
+      // Prepare an OpKernelContext.
+      OpKernelContext::Params op_params;
+      op_params.step_id = parent_->step_id_;
+      op_params.device = device_;
+      gtl::InlinedVector<TensorValue, 4> inputs;
+      inputs.push_back(TensorValue(&tensor_));
+      op_params.inputs = &inputs;
+      gtl::InlinedVector<AllocatorAttributes, 4> input_aa(
+          {AllocatorAttributes()});
+      op_params.input_alloc_attrs = &input_aa;
+      gtl::InlinedVector<DeviceContext*, 4> input_dc;
+      DeviceContext* dev_ctx = nullptr;
+      auto* dev_info = device_->tensorflow_gpu_device_info();
+      if (dev_info) {
+        dev_ctx = dev_info->default_context;
+        dev_ctx->Ref();
+      } else {
+        dev_ctx = new DeviceContext;
+      }
+      input_dc.push_back(dev_ctx);
+      op_params.input_device_contexts = &input_dc;
+      op_params.op_device_context = dev_ctx;
+      int forward_from[] = {0};
+      if (col_params_.is_source) {
+        op_params.forward_from_array = &forward_from[0];
+      }
+      AllocatorAttributes generic_alloc_attr;
+      op_params.output_attr_array = &generic_alloc_attr;
+      std::unique_ptr<OpKernel> op =
+          col_params_.is_source
+              ? parent_->GetCollectiveBcastSend(col_params_, &tensor_,
+                                                DEVICE_CPU, device_)
+              : parent_->GetCollectiveBcastRecv(col_params_, tensor_.shape(),
+                                                DEVICE_CPU, device_);
+      op_params.op_kernel = op.get();
+      OpKernelContext ctx(&op_params, 1);
+
+      Tensor* output_tensor_ptr = nullptr;
+      if (col_params_.is_source) {
+        TF_CHECK_OK(ctx.forward_input_or_allocate_output(
+            {0}, 0, tensor_.shape(), &output_tensor_ptr));
+      } else {
+        TF_CHECK_OK(
+            ctx.allocate_output(0, tensor_.shape(), &output_tensor_ptr));
+      }
+      CHECK_EQ(output_tensor_ptr, ctx.mutable_output(0));
+
+      // Prepare a Broadcaster instance.
+      string exec_key =
+          strings::StrCat(col_params_.instance.instance_key, ":0:0");
+      Broadcaster broadcaster(parent_->col_exec_, parent_->dev_mgr_.get(), &ctx,
+                              &op_params, col_params_, exec_key, kStepId,
+                              output_tensor_ptr);
+
+      // Start execution in a threadpool then wait for completion.
+      Notification notification;
+      broadcaster.Run([this, &notification](Status s) {
+        status_ = s;
+        notification.Notify();
+      });
+      notification.WaitForNotification();
+      if (status_.ok()) {
+        CHECK(tensor_.CopyFrom(*ctx.mutable_output(0), tensor_.shape()));
+      }
+
+      dev_ctx->Unref();
+    }
+
+    BroadcasterTest* parent_;
+    string dev_name_;
+    DeviceType device_type_ = DEVICE_CPU;
+    int rank_;
+    Tensor tensor_;
+    Device* device_;
+    CollectiveParams col_params_;
+    std::unique_ptr<CollectiveAdapter> ca_;
+    std::unique_ptr<OpKernelContext> ctx_;
+    Status status_;
+  };  // class DeviceInstance
+
+  bool stop_ = false;
+  int64 step_id_ = kStepId;
+  int broadcast_dev_id_ = 0;
+  DeviceType device_type_;
+  TestCollectiveExecutorMgr col_exec_mgr_;
+  CollectiveExecutor* col_exec_ = nullptr;
+  CollectiveRemoteAccessLocal* rma_;
+  std::unique_ptr<DeviceResolverLocal> dev_resolver_;
+  std::vector<DeviceInstance*> instances_;
+  CollectiveParams col_params_;
+  std::vector<Device*> gpu_devices_;
+  std::unique_ptr<DeviceMgr> dev_mgr_;
+  mutex mu_;
+  int bcast_recv_counter_ GUARDED_BY(mu_) = 0;
+  int bcast_send_counter_ GUARDED_BY(mu_) = 0;
+  int failure_count_ GUARDED_BY(mu_) = 0;
+};
+
+// Tests of full broadcast algorithm, with different device and
+// data types.
+// B = data element type
+// T = device type
+// W = number of workers
+// D = number of devices per worker
+// L = tensor length
+// A = abort after count
+#define DEF_TEST(B, T, W, D, L, A)                                   \
+  TEST_F(BroadcasterTest,                                            \
+         DaTy##B##_DevTy##T##_Wkr##W##_Dev##D##_Len##L##_Abt##A) {   \
+    DataType dtype = DT_##B;                                         \
+    switch (dtype) {                                                 \
+      case DT_FLOAT: {                                               \
+        RunTest<float>(dtype, DEVICE_##T, W, D, L, A);               \
+      } break;                                                       \
+      case DT_DOUBLE: {                                              \
+        RunTest<double>(dtype, DEVICE_##T, W, D, L, A);              \
+      } break;                                                       \
+      case DT_INT32: {                                               \
+        RunTest<int32>(dtype, DEVICE_##T, W, D, L, A);               \
+      } break;                                                       \
+      case DT_INT64: {                                               \
+        RunTest<int64>(dtype, DEVICE_##T, W, D, L, A);               \
+      } break;                                                       \
+      default:                                                       \
+        LOG(FATAL) << "Unimplemented";                               \
+    }                                                                \
+  }
+
+#ifndef GOOGLE_CUDA
+//       B      T   W  D  L       A
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1001, 0)
+DEF_TEST(FLOAT, CPU, 2, 1, 128, 0)
+DEF_TEST(FLOAT, CPU, 2, 4, 128, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 4095, 0)
+DEF_TEST(FLOAT, CPU, 4, 4, 1045991, 0)
+
+DEF_TEST(DOUBLE, CPU, 2, 4, 128, 0)
+DEF_TEST(INT32, CPU, 2, 4, 128, 0)
+DEF_TEST(INT64, CPU, 2, 4, 128, 0)
+
+// Failure cases
+DEF_TEST(FLOAT, CPU, 2, 4, 128, 1)
+DEF_TEST(FLOAT, CPU, 2, 4, 128, 5)
+#endif
+
+#ifdef GOOGLE_CUDA
+// Can only set W=1 for GPU tests.
+//       B      T   W  D  L       A
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 33, 0)
+DEF_TEST(FLOAT, GPU, 1, 3, 64, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1001, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 4095, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1045991, 0)
+
+DEF_TEST(DOUBLE, GPU, 1, 8, 1001, 0)
+DEF_TEST(INT64, GPU, 1, 8, 1001, 0)
+
+// Failure cases
+DEF_TEST(FLOAT, GPU, 1, 8, 128, 6)
+#endif
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.cc b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
index 393d3f824d4..bdddf927d89 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.cc
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
@@ -250,6 +250,38 @@ GlobalDeviceMap EstablishGlobalRank(
   return gdm;
 }
 
+// Count the devices associated with each task and set
+// cp->same_num_devices_per_task. Requires that cp->instance.task_names
+// be sorted.
+void SetDevPerTask(CollectiveParams* cp) {
+  cp->instance.same_num_devices_per_task = false;
+  if (cp->instance.task_names.empty()) return;
+  int dev_per_task = -1;
+  int count = 0;
+  const string* last_task_name = &cp->instance.task_names[0];
+  for (const string& task_name : cp->instance.task_names) {
+    if (task_name != *last_task_name) {
+      CHECK_GT(count, 0);
+      if (dev_per_task < 0) {
+        dev_per_task = count;
+      } else {
+        CHECK_GT(dev_per_task, 0);
+        if (count != dev_per_task) return;
+      }
+      count = 1;
+      last_task_name = &task_name;
+    } else {
+      ++count;
+    }
+  }
+  CHECK_GT(count, 0);
+  if ((dev_per_task > 0) && (count != dev_per_task)) {
+    return;
+  }
+  cp->instance.same_num_devices_per_task = true;
+  CHECK_EQ((cp->group.group_size % cp->group.num_tasks), 0);
+}
+
 // Sort cp->instance.device_names lexicographically, but do so by first
 // computing a reordering permutation so we can keep cp->instance.task_names
 // in corresponding order.
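The counting loop in SetDevPerTask() above is easy to check in isolation. A minimal standalone sketch of the same invariant (SameNumDevicesPerTask is a hypothetical helper over plain std::string names, not the TF code path):

#include <string>
#include <vector>

// True iff each distinct task contributes the same number of entries.
// Mirrors SetDevPerTask() above; assumes task_names is grouped/sorted.
bool SameNumDevicesPerTask(const std::vector<std::string>& task_names) {
  if (task_names.empty()) return false;
  int dev_per_task = -1;
  int count = 0;
  const std::string* last = &task_names[0];
  for (const std::string& t : task_names) {
    if (t != *last) {
      if (dev_per_task < 0) {
        dev_per_task = count;
      } else if (count != dev_per_task) {
        return false;
      }
      count = 1;
      last = &t;
    } else {
      ++count;
    }
  }
  // Check the final run; a single task trivially qualifies.
  return dev_per_task < 0 || count == dev_per_task;
}

// e.g. {"t0", "t0", "t1", "t1"} -> true; {"t0", "t0", "t1"} -> false.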
@@ -278,6 +310,7 @@ void SortDevicesAndTasks(CollectiveParams* cp) { cp->instance.device_names = std::move(new_devs); cp->instance.task_names = std::move(new_tasks); VLOG(1) << "Modified device_names on " << cp; + SetDevPerTask(cp); } // Establish the requested number of subdivision permutations based on the @@ -343,17 +376,18 @@ void GenerateSubdivPerms(const string& device, int source_rank, if (cp->instance.type == BROADCAST_COLLECTIVE) { CHECK_GE(source_rank, 0); - cp->subdiv_source_rank.resize( + cp->instance.impl_details.subdiv_source_rank.resize( cp->instance.impl_details.subdiv_offsets.size(), -1); - for (int sdi = 0; sdi < cp->subdiv_source_rank.size(); ++sdi) { + for (int sdi = 0; sdi < cp->instance.impl_details.subdiv_source_rank.size(); + ++sdi) { for (int j = 0; j < cp->group.group_size; ++j) { if (cp->instance.impl_details.subdiv_permutations[sdi][j] == source_rank) { - cp->subdiv_source_rank[sdi] = j; + cp->instance.impl_details.subdiv_source_rank[sdi] = j; break; } } - CHECK_GE(cp->subdiv_source_rank[sdi], 0); + CHECK_GE(cp->instance.impl_details.subdiv_source_rank[sdi], 0); } } diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc index 4e3c7125f2b..4e33c4779a3 100644 --- a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc +++ b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc @@ -91,9 +91,10 @@ TEST_F(CollectiveParamResolverLocalTest, CompleteParamsReduction1Task) { EXPECT_TRUE(cps[i].task.is_local[j]); } EXPECT_EQ(cps[i].subdiv_rank[0], i); - EXPECT_EQ(cps[i].subdiv_source_rank.size(), 0); + EXPECT_EQ(cps[i].instance.impl_details.subdiv_source_rank.size(), 0); EXPECT_FALSE(cps[i].is_source); EXPECT_EQ(cps[i].default_rank, i); + EXPECT_TRUE(cps[i].instance.same_num_devices_per_task); } } @@ -138,10 +139,11 @@ TEST_F(CollectiveParamResolverLocalTest, CompleteParamsBroadcast1Task) { } ASSERT_GT(cps[i].subdiv_rank.size(), 0); EXPECT_EQ(cps[i].subdiv_rank[0], i); - ASSERT_GT(cps[i].subdiv_source_rank.size(), 0); - EXPECT_EQ(cps[i].subdiv_source_rank[0], 1); + ASSERT_GT(cps[i].instance.impl_details.subdiv_source_rank.size(), 0); + EXPECT_EQ(cps[i].instance.impl_details.subdiv_source_rank[0], 1); EXPECT_EQ(cps[i].is_source, (i == 1)); EXPECT_EQ(cps[i].default_rank, i); + EXPECT_TRUE(cps[i].instance.same_num_devices_per_task); } } diff --git a/tensorflow/core/common_runtime/collective_rma_local.h b/tensorflow/core/common_runtime/collective_rma_local.h index d25dd5f04ac..716e23bfa16 100644 --- a/tensorflow/core/common_runtime/collective_rma_local.h +++ b/tensorflow/core/common_runtime/collective_rma_local.h @@ -67,6 +67,8 @@ class CollectiveRemoteAccessLocal : public PerStepCollectiveRemoteAccess { dev_resolver_->ClearTask(task); } + BufRendezvous* buf_rendezvous() override { return &buf_rendezvous_; } + // Copy utility that always copies bytes from src to dst even if // they are on the same device, unlike CopyTensor::ViaDMA which will // just change the dst buffer pointer in that case. 
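The subdiv_source_rank change above records the source's position within each subdivision permutation rather than its default rank. A tiny standalone illustration with made-up values (SubdivSourceRank is a hypothetical helper, not TF code):

#include <vector>

// Mirrors the lookup in GenerateSubdivPerms() above: find the position j
// whose permutation entry equals the source's default rank.
int SubdivSourceRank(const std::vector<int>& perm, int source_rank) {
  for (int j = 0; j < static_cast<int>(perm.size()); ++j) {
    if (perm[j] == source_rank) return j;
  }
  return -1;  // GenerateSubdivPerms CHECKs that this never happens.
}

// e.g. SubdivSourceRank({2, 0, 1}, /*source_rank=*/1) == 2.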
diff --git a/tensorflow/core/framework/collective.cc b/tensorflow/core/framework/collective.cc
index a26f2c2f315..d4ac50cbbe6 100644
--- a/tensorflow/core/framework/collective.cc
+++ b/tensorflow/core/framework/collective.cc
@@ -38,6 +38,7 @@ CollInstanceParams& CollInstanceParams::operator=(
   device_names.clear();
   device_names.assign(other.device_names.begin(), other.device_names.end());
   task_names.assign(other.task_names.begin(), other.task_names.end());
+  same_num_devices_per_task = other.same_num_devices_per_task;
   impl_details.subdiv_offsets.assign(
       other.impl_details.subdiv_offsets.begin(),
       other.impl_details.subdiv_offsets.end());
@@ -76,6 +77,13 @@ string CollInstanceParams::ToString() const {
     }
     strings::StrAppend(&v, "}");  // one subdiv
   }
+  if (!impl_details.subdiv_source_rank.empty()) {
+    strings::StrAppend(&v, " subdiv_source_rank={");
+    for (const auto& r : impl_details.subdiv_source_rank) {
+      strings::StrAppend(&v, r, ",");
+    }
+    strings::StrAppend(&v, "}");
+  }
   strings::StrAppend(&v, "}");  // all subdivs
   return v;
 }
@@ -98,13 +106,6 @@ string CollectiveParams::ToString() const {
   for (const auto& r : subdiv_rank) {
     strings::StrAppend(&v, r, ",");
   }
-  if (!subdiv_source_rank.empty()) {
-    strings::StrAppend(&v, " subdiv_rank={");
-    for (const auto& r : subdiv_source_rank) {
-      strings::StrAppend(&v, r, ",");
-    }
-    strings::StrAppend(&v, "}");
-  }
   strings::StrAppend(&v, "}}");
   return v;
 }
diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h
index 5810c7fa547..40d82ab0e97 100644
--- a/tensorflow/core/framework/collective.h
+++ b/tensorflow/core/framework/collective.h
@@ -79,6 +79,8 @@ struct CollInstanceParams {
   std::vector<string> device_names;
   // Task name prefix of corresponding device name.
   std::vector<string> task_names;
+  // True if every task has the same number of devices.
+  bool same_num_devices_per_task;
   CollImplDetails impl_details;
   string ToString() const;
   CollInstanceParams& operator=(const struct CollInstanceParams& other);
@@ -102,7 +104,6 @@ struct CollectiveParams {
   bool is_source;  // broadcast only
   // Rank of this device in each subdivision permutation.
   std::vector<int> subdiv_rank;
-  std::vector<int> subdiv_source_rank;
   std::unique_ptr<OpKernel> merge_op;  // reduction only
   std::unique_ptr<OpKernel> final_op;  // reduction only
   string ToString() const;
@@ -284,12 +285,14 @@ class CollectiveExecutor : public PeerAccessInterface, public core::RefCounted {
   TF_DISALLOW_COPY_AND_ASSIGN(CollectiveExecutor);
 };
 
-// Interface of a helper object that provices a CollectiveExecutor with
+// Interface of a helper object that provides a CollectiveExecutor with
 // all of the remote access it needs.
 class CollectiveRemoteAccess : public PeerAccessInterface,
                                public DeviceResolverInterface {
  public:
   virtual ~CollectiveRemoteAccess() {}
+
+  virtual BufRendezvous* buf_rendezvous() = 0;
 };
 
 // A per-step version of CollectiveRemoteAccess that cleans up outstanding
From 55706e693ab20f6200061fb73067cbf27707cccd Mon Sep 17 00:00:00 2001
From: Igor Saprykin
Date: Thu, 19 Apr 2018 13:19:27 -0700
Subject: [PATCH 0451/1734] Support various shapes in TPU DistributionStrategy.
PiperOrigin-RevId: 193563912 --- .../distribute/python/minimize_loss_test.py | 11 +--- .../distribute/python/single_loss_example.py | 5 +- .../contrib/distribute/python/tpu_strategy.py | 61 +++++++++++++------ .../contrib/distribute/python/values.py | 33 ++++++++++ 4 files changed, 80 insertions(+), 30 deletions(-) diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py index 6c73250dedc..43b2e91cbf1 100644 --- a/tensorflow/contrib/distribute/python/minimize_loss_test.py +++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py @@ -57,25 +57,18 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): model_fn, dataset_fn, layer = minimize_loss_example( optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss) - def tpu_dataset_fn(): - return dataset_fn().batch(2) # TODO(isaprykin): Eliminate `is_tpu`. Probably add a # `DistributionStrategy.create_monitor` so that each DistributionStrategy # could influence its training loop. That method would return an instance # of Monitor. TPUMonitor would execute tpu.initialize_system() and # tpu.shutdown_system(). iterator = distribution.distribute_dataset( - tpu_dataset_fn if is_tpu else dataset_fn).make_one_shot_iterator() + dataset_fn).make_one_shot_iterator() def run_step(): - # TODO(isaprykin): Make iterator get_next() return a list of sub- - # batches for each iteration. Pass iterator.get_next() and not iterator - # to call_for_each_tower. return distribution.group( distribution.call_for_each_tower( - model_fn, - iterator.get_next() if not is_tpu else iterator, - run_concurrently=layer.built)) + model_fn, iterator.get_next(), run_concurrently=layer.built)) if not context.executing_eagerly(): with self.test_session() as sess: diff --git a/tensorflow/contrib/distribute/python/single_loss_example.py b/tensorflow/contrib/distribute/python/single_loss_example.py index 9e8f919c8a9..abd13c6cc69 100644 --- a/tensorflow/contrib/distribute/python/single_loss_example.py +++ b/tensorflow/contrib/distribute/python/single_loss_example.py @@ -54,7 +54,7 @@ def minimize_loss_example(optimizer_fn, """Example of non-distribution-aware legacy code.""" def dataset_fn(): - return dataset_ops.Dataset.from_tensors([[1.]]).repeat() + return dataset_ops.Dataset.from_tensors([[1.]]).repeat().batch(2) # An Optimizer instance is created either outside or inside model_fn. outer_optimizer = None @@ -63,10 +63,11 @@ def minimize_loss_example(optimizer_fn, layer = core.Dense(1, use_bias=use_bias) - def model_fn(x): + def model_fn(xs): """A very simple model written by the user.""" def loss_fn(): + x = math_ops.reduce_mean(xs, keepdims=True) y = array_ops.reshape(layer(x), []) - constant_op.constant(1.) 
return y * y diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py index 804217b5cec..ceb52ceca72 100644 --- a/tensorflow/contrib/distribute/python/tpu_strategy.py +++ b/tensorflow/contrib/distribute/python/tpu_strategy.py @@ -23,6 +23,7 @@ from __future__ import print_function from tensorflow.contrib import tpu from tensorflow.contrib.distribute.python import one_device_strategy +from tensorflow.contrib.distribute.python import values from tensorflow.contrib.tpu.python.ops import tpu_ops from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -33,35 +34,48 @@ from tensorflow.python.ops import control_flow_ops # TODO(isaprykin): Consider whether inheriting is really appropriate. class TPUStrategy(one_device_strategy.OneDeviceStrategy): + """Experimental TPU distribution strategy implementation.""" - def __init__(self, master=None, iterations=None, model_dir=None): + def __init__(self, + global_batch_size=2, + num_cores_per_host=2, + iterations_per_step=2): + # TODO(isaprykin): Generalize the defaults. super(TPUStrategy, self).__init__('/cpu:0') + # TODO(isaprykin): Auto-detect number of cores and hosts. + self._num_cores_per_host = num_cores_per_host + self._global_batch_size = global_batch_size + # TODO(isaprykin): This might have to be per-call. + self._iterations_per_step = iterations_per_step + + def distribute_dataset(self, dataset_fn): + return values.PerIterationDataset( + self._call_dataset_fn(dataset_fn), self._iterations_per_step) def _call_for_each_tower(self, fn, *args, **kwargs): kwargs.pop('run_concurrently', None) - # TODO(isaprykin): Give an API for many iterations per step. - iterations = 1 + # TODO(isaprykin): Support variable arguments similar to PerDevice+regroup. + inputs = args[0] - # TODO(isaprykin): Do not hard code shapes and input format :) - # TODO(isaprykin): Detect the number of TPU cores automatically. - - def dequeueing_fn(*args, **kwargs): - del args, kwargs - x, = tpu.infeed_dequeue_tuple(dtypes=[dtypes.float32], shapes=[[1, 1, 1]]) - return fn(x) - - iterator = args[0] + sharded_shape = [None] # Python 2 nonlocal. def infeed_input(i): """Get input, split it and then enqueue.""" - batches = iterator.get_next() - batches = array_ops.split(batches, 2) + batches = array_ops.gather(inputs, i) + + # TODO(isaprykin): Handle partial batch. 
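+      # (E.g. with global_batch_size=2 and num_cores_per_host=2, a
+      # [2, 1, 1] batch yields a [1, 1, 1] shard for each core's infeed,
+      # matching the shapes that were previously hard-coded.)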
+ global_shape = [self._global_batch_size] + list(batches.get_shape())[1:] + sharded_shape[0] = ([self._global_batch_size / self._num_cores_per_host] + + list(global_shape)[1:]) + + batches.set_shape(global_shape) + batches = array_ops.split(batches, self._num_cores_per_host) infeeds = [ tpu_ops.infeed_enqueue_tuple( - inputs=[batches[j]], shapes=[[1, 1, 1]], device_ordinal=j) - for j in range(2) + inputs=[batches[j]], shapes=[sharded_shape[0]], device_ordinal=j) + for j in range(self._num_cores_per_host) ] with ops.control_dependencies(infeeds): @@ -69,14 +83,23 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy): with ops.device('/task:0/device:CPU:0'): enqueue_ops = control_flow_ops.while_loop( - lambda i: i < iterations, + lambda i: i < self._iterations_per_step, infeed_input, [constant_op.constant(0)], parallel_iterations=1) + assert sharded_shape[0] + + def dequeueing_fn(*args, **kwargs): + del args, kwargs + x, = tpu.infeed_dequeue_tuple( + dtypes=[dtypes.float32], shapes=[sharded_shape[0]]) + return fn(x) + def iterate_on_tpu(): - return tpu.repeat(iterations, dequeueing_fn, []) + return tpu.repeat(self._iterations_per_step, dequeueing_fn, []) with one_device_strategy._OneDeviceTowerContext(self): # pylint: disable=protected-access - tpu_result = tpu.batch_parallel(iterate_on_tpu, [], num_shards=2) + tpu_result = tpu.batch_parallel( + iterate_on_tpu, [], num_shards=self._num_cores_per_host) return control_flow_ops.group(tpu_result, enqueue_ops) diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py index 18fedd27751..62016c3a789 100644 --- a/tensorflow/contrib/distribute/python/values.py +++ b/tensorflow/contrib/distribute/python/values.py @@ -570,6 +570,39 @@ class PerDeviceDataset(object): dataset_iterator, self._devices, self._prefetch_on_device) +class MultiIterator(object): + """Iterator that returns results of multiple get_next()s.""" + + def __init__(self, dataset_iterator, iterations): + self._dataset_iterator = dataset_iterator + self._iterations = iterations + + def get_next(self, name=None): + return [ + self._dataset_iterator.get_next(name=name) + for _ in range(self._iterations) + ] + + @property + def initializer(self): + return self._dataset_iterator.initializer + + +class PerIterationDataset(object): + + def __init__(self, dataset, iterations): + self._dataset = dataset + self._iterations = iterations + + def make_one_shot_iterator(self): + iterator = self._dataset.make_one_shot_iterator() + return MultiIterator(iterator, self._iterations) + + def make_initializable_iterator(self): + iterator = self._dataset.make_initializable_iterator() + return MultiIterator(iterator, self._iterations) + + class MapOutput(object): """Map can result in multiple outputs per device.""" From 7f1e64eb94447665047fac16c67b5351bcf3c8a3 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 19 Apr 2018 13:21:25 -0700 Subject: [PATCH 0452/1734] Allow output has a different shape from input in the image.transform (#17011). 
PiperOrigin-RevId: 193564222
---
 tensorflow/contrib/image/kernels/image_ops.cc |  7 ++-
 tensorflow/contrib/image/kernels/image_ops.h  |  2 +-
 tensorflow/contrib/image/ops/image_ops.cc     | 52 +++++++++++++++++--
 .../python/kernel_tests/image_ops_test.py     | 30 +++++++++++
 .../contrib/image/python/ops/image_ops.py     | 39 ++++++++------
 5 files changed, 107 insertions(+), 23 deletions(-)

diff --git a/tensorflow/contrib/image/kernels/image_ops.cc b/tensorflow/contrib/image/kernels/image_ops.cc
index c2e32da133b..ae4b1ba62a8 100644
--- a/tensorflow/contrib/image/kernels/image_ops.cc
+++ b/tensorflow/contrib/image/kernels/image_ops.cc
@@ -70,6 +70,7 @@ class ImageProjectiveTransform : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     const Tensor& images_t = ctx->input(0);
     const Tensor& transform_t = ctx->input(1);
+    const Tensor& output_dim = ctx->input(2);
     OP_REQUIRES(ctx, images_t.shape().dims() == 4,
                 errors::InvalidArgument("Input images must have rank 4"));
     OP_REQUIRES(ctx,
@@ -83,7 +84,11 @@ class ImageProjectiveTransform : public OpKernel {
     auto images = images_t.tensor<T, 4>();
     auto transform = transform_t.matrix<float>();
     Tensor* output_t;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, images_t.shape(), &output_t));
+    // Image is NHWC format.
+    auto output_shape = images_t.shape();
+    output_shape.set_dim(1, output_dim.vec<int32>()(0));
+    output_shape.set_dim(2, output_dim.vec<int32>()(1));
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_t));
     auto output = output_t->tensor<T, 4>();
     (FillProjectiveTransform<Device, T>(interpolation_))(
         ctx->eigen_device<Device>(), &output, images, transform);
diff --git a/tensorflow/contrib/image/kernels/image_ops.h b/tensorflow/contrib/image/kernels/image_ops.h
index ad501330617..2320329b923 100644
--- a/tensorflow/contrib/image/kernels/image_ops.h
+++ b/tensorflow/contrib/image/kernels/image_ops.h
@@ -161,7 +161,7 @@ struct FillProjectiveTransform {
   void operator()(const Device& device, OutputType* output,
                   const InputType& images,
                   const TransformsType& transform) const {
-    output->device(device) = images.generate(
+    output->device(device) = output->generate(
         ProjectiveGenerator<Device, T>(images, transform, interpolation_));
   }
 };
diff --git a/tensorflow/contrib/image/ops/image_ops.cc b/tensorflow/contrib/image/ops/image_ops.cc
index 68771b3d054..4c6d8c0d192 100644
--- a/tensorflow/contrib/image/ops/image_ops.cc
+++ b/tensorflow/contrib/image/ops/image_ops.cc
@@ -19,9 +19,55 @@ limitations under the License.
 
 namespace tensorflow {
 
+using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
+namespace {
+
+// Sets output[0] to shape [batch_dim,height,width,channel_dim], where
+// height and width come from the size_tensor.
+Status SetOutputToSizedImage(InferenceContext* c, DimensionHandle batch_dim,
+                             int size_input_idx, DimensionHandle channel_dim) {
+  // Verify shape of size input.
+  ShapeHandle size;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(size_input_idx), 1, &size));
+  DimensionHandle unused;
+  TF_RETURN_IF_ERROR(c->WithValue(c->Dim(size, 0), 2, &unused));
+
+  // Get size values from the size tensor.
+  const Tensor* size_tensor = c->input_tensor(size_input_idx);
+  DimensionHandle width;
+  DimensionHandle height;
+  if (size_tensor == nullptr) {
+    width = c->UnknownDim();
+    height = c->UnknownDim();
+  } else {
+    // TODO(petewarden) - Remove once we have constant evaluation in C++ only.
+ if (size_tensor->dtype() != DT_INT32) { + return errors::InvalidArgument( + "Bad size input type for SetOutputToSizedImage: Expected DT_INT32 " + "but got ", + DataTypeString(size_tensor->dtype()), " for input #", size_input_idx, + " in ", c->DebugString()); + } + auto vec = size_tensor->vec(); + height = c->MakeDim(vec(0)); + width = c->MakeDim(vec(1)); + } + c->set_output(0, c->MakeShape({batch_dim, height, width, channel_dim})); + return Status::OK(); +} + +Status ResizeShapeFn(InferenceContext* c) { + ShapeHandle input; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input)); + return SetOutputToSizedImage(c, c->Dim(input, 0), 2 /* size_input_idx */, + c->Dim(input, 3)); +} + +} // namespace + // TODO(ringwalt): Add a "fill_mode" argument with "constant", "mirror", etc. // TODO(ringwalt): Add a "fill_constant" argument for constant mode (default 0). // TODO(ringwalt): Add an "output_shape" argument. This is sufficient to @@ -29,13 +75,11 @@ using shape_inference::ShapeHandle; REGISTER_OP("ImageProjectiveTransform") .Input("images: dtype") .Input("transforms: float32") + .Input("output_shape: int32") .Attr("dtype: {uint8, int32, int64, float32, float64}") .Attr("interpolation: string") .Output("transformed_images: dtype") - .SetShapeFn([](InferenceContext* c) { - c->set_output(0, c->input(0)); - return Status::OK(); - }) + .SetShapeFn(ResizeShapeFn) .Doc(R"doc( Applies the given transform to each of the images. diff --git a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py index b50177ae565..c0151d320f9 100644 --- a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py +++ b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py @@ -195,10 +195,40 @@ class ImageOpsTest(test_util.TensorFlowTestCase): x_init_value=test_image) self.assertLess(left_err, 1e-10) + def _test_grad_different_shape(self, input_shape, output_shape): + with self.test_session(): + test_image_shape = input_shape + test_image = np.random.randn(*test_image_shape) + test_image_tensor = constant_op.constant( + test_image, shape=test_image_shape) + test_transform = image_ops.angles_to_projective_transforms( + np.pi / 2, 4, 4) + + if len(output_shape) == 2: + resize_shape = output_shape + elif len(output_shape) == 3: + resize_shape = output_shape[0:2] + elif len(output_shape) == 4: + resize_shape = output_shape[1:3] + output = image_ops.transform( + images=test_image_tensor, + transforms=test_transform, + output_shape=resize_shape) + left_err = gradient_checker.compute_gradient_error( + test_image_tensor, + test_image_shape, + output, + output_shape, + x_init_value=test_image) + self.assertLess(left_err, 1e-10) + def test_grad(self): self._test_grad([16, 16]) self._test_grad([4, 12, 12]) self._test_grad([3, 4, 12, 12]) + self._test_grad_different_shape([16, 16], [8, 8]) + self._test_grad_different_shape([4, 12, 3], [8, 24, 3]) + self._test_grad_different_shape([3, 4, 12, 3], [3, 8, 24, 3]) class BipartiteMatchTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py index c139ae89d8d..0cb7bdc75dd 100644 --- a/tensorflow/contrib/image/python/ops/image_ops.py +++ b/tensorflow/contrib/image/python/ops/image_ops.py @@ -212,7 +212,11 @@ def translations_to_projective_transforms(translations, name=None): axis=1) -def transform(images, transforms, interpolation="NEAREST", name=None): +def transform(images, + transforms, + output_shape=None, + 
+              interpolation="NEAREST",
+              name=None):
   """Applies the given transform(s) to the image(s).
 
   Args:
@@ -228,7 +232,10 @@ def transform(images, transforms, interpolation="NEAREST", name=None):
         where `k = c0 x + c1 y + 1`. The transforms are *inverted* compared to
         the transform mapping input points to output points. Note that
         gradients are not backpropagated into transformation parameters.
+    output_shape: Output dimensions after the transform, [height, width]. If
+        None, the output is the same size as the input image.
     interpolation: Interpolation mode. Supported values: "NEAREST", "BILINEAR".
+    name: The name of the op.
 
   Returns:
     Image(s) with the same type and shape as `images`, with the given
@@ -255,6 +262,14 @@ def transform(images, transforms, interpolation="NEAREST", name=None):
   else:
     raise TypeError("Images should have rank between 2 and 4.")
 
+  if output_shape is None:
+    output_shape = images.get_shape()[1:3]
+  elif len(output_shape) != 2:
+    raise TypeError(
+        "output_shape must either be None or a vector of 2 elements.")
+  output_shape = ops.convert_to_tensor(
+      output_shape, name="output_shape", dtype=dtypes.int32)
+
   if len(transform_or_transforms.get_shape()) == 1:
     transforms = transform_or_transforms[None]
   elif transform_or_transforms.get_shape().ndims is None:
@@ -265,7 +280,7 @@ def transform(images, transforms, interpolation="NEAREST", name=None):
   else:
     raise TypeError("Transforms should have rank 1 or 2.")
   output = gen_image_ops.image_projective_transform(
-      images, transforms, interpolation=interpolation.upper())
+      images, transforms, output_shape, interpolation=interpolation.upper())
   if len(image_or_images.get_shape()) == 2:
     return output[0, :, :, 0]
   elif len(image_or_images.get_shape()) == 3:
@@ -375,14 +390,6 @@ def _image_projective_transform_grad(op, grad):
   if image_or_images.dtype.base_dtype not in _IMAGE_DTYPES:
     raise TypeError("Invalid dtype %s."
% image_or_images.dtype) - if len(image_or_images.get_shape()) == 2: - images = image_or_images[None, :, :, None] - elif len(image_or_images.get_shape()) == 3: - images = image_or_images[None, :, :, :] - elif len(image_or_images.get_shape()) == 4: - images = image_or_images - else: - raise TypeError("Images should have rank between 2 and 4") if len(transform_or_transforms.get_shape()) == 1: transforms = transform_or_transforms[None] elif len(transform_or_transforms.get_shape()) == 2: @@ -395,13 +402,11 @@ def _image_projective_transform_grad(op, grad): inverse = linalg_ops.matrix_inverse(transforms) transforms = matrices_to_flat_transforms(inverse) output = gen_image_ops.image_projective_transform( - grad, transforms, interpolation=interpolation) - if len(image_or_images.get_shape()) == 2: - return [output[0, :, :, 0], None] - elif len(image_or_images.get_shape()) == 3: - return [output[0, :, :, :], None] - else: - return [output, None] + images=grad, + transforms=transforms, + output_shape=image_or_images.get_shape()[1:3], + interpolation=interpolation) + return [output, None, None] def bipartite_match(distance_mat, From ab47eb8d9bcac55fd19b0e862cf9a2a7de195787 Mon Sep 17 00:00:00 2001 From: Asim Shankar Date: Thu, 19 Apr 2018 13:38:43 -0700 Subject: [PATCH 0453/1734] tools/lib_package: Fix typo in README PiperOrigin-RevId: 193566850 --- tensorflow/tools/lib_package/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/tools/lib_package/README.md b/tensorflow/tools/lib_package/README.md index 70081482603..cb6aef26245 100644 --- a/tensorflow/tools/lib_package/README.md +++ b/tensorflow/tools/lib_package/README.md @@ -35,8 +35,8 @@ The following commands: bazel test --config opt //tensorflow/tools/lib_package:libtensorflow_test bazel build --config opt \ //tensorflow/tools/lib_package:libtensorflow_jni.tar.gz \ - //tensorflow/tools/lib_package:libtensorflow.jar \ - //tensorflow/tools/lib_package:libtensorflow-src.jar + //tensorflow/java:libtensorflow.jar \ + //tensorflow/java:libtensorflow-src.jar ``` test and produce the following: @@ -44,9 +44,9 @@ test and produce the following: - The native library (`libtensorflow_jni.so`) packaged in an archive at: `bazel-bin/tensorflow/tools/lib_package/libtensorflow_jni.tar.gz` - The Java archive at: - `bazel-bin/tensorflow/tools/lib_package/libtensorflow.jar` + `bazel-bin/tensorflow/java/libtensorflow.jar` - The Java archive for Java sources at: - `bazel-bin/tensorflow/tools/lib_package/libtensorflow-src.jar` + `bazel-bin/tensorflow/java/libtensorflow-src.jar` ## Release From 459d61cbe8ab9cbb86b2bb7eac602ff565d54fde Mon Sep 17 00:00:00 2001 From: Jie Date: Thu, 19 Apr 2018 13:48:14 -0700 Subject: [PATCH 0454/1734] [PR comment addressed] switched from std::string to TF string custom_plugin_examples python test added (bazel) style guide violation addressed --- .../contrib/tensorrt/convert/convert_nodes.cc | 22 ++--- .../tensorrt/custom_plugin_examples/BUILD | 42 ++++++--- .../custom_plugin_examples/__init__.py | 12 +-- .../inc_op_kernel.cu.cc | 2 - .../custom_plugin_examples/inc_op_kernel.h | 3 +- .../{inc_op_plugin.cc => inc_op_plugin.cu.cc} | 9 +- .../custom_plugin_examples/inc_op_plugin.h | 18 ++-- .../custom_plugin_examples/ops/inc_op.cc | 4 +- .../{test => }/plugin_test.py | 46 +++++----- tensorflow/contrib/tensorrt/log/trt_logger.h | 2 +- .../contrib/tensorrt/plugin/trt_plugin.cc | 3 +- .../contrib/tensorrt/plugin/trt_plugin.h | 14 +-- .../tensorrt/plugin/trt_plugin_factory.cc | 7 +- 
 .../tensorrt/plugin/trt_plugin_factory.h      |  8 +-
 .../tensorrt/plugin/trt_plugin_utils.cc       |  2 +-
 .../tensorrt/plugin/trt_plugins_test.cc       | 19 ++--
 tensorflow/contrib/tensorrt/plugin_test.py    | 88 +++++++++++++++++++
 .../tensorrt/resources/trt_resources.h        |  2 +-
 18 files changed, 205 insertions(+), 98 deletions(-)
 rename tensorflow/contrib/tensorrt/custom_plugin_examples/{inc_op_plugin.cc => inc_op_plugin.cu.cc} (91%)
 rename tensorflow/contrib/tensorrt/custom_plugin_examples/{test => }/plugin_test.py (67%)
 create mode 100644 tensorflow/contrib/tensorrt/plugin_test.py

diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 874be96c781..c8a96e5dba8 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -241,9 +241,9 @@ class TFAttrs {
     return attrs_.at(key);
   }
   template <typename T>
-  T get(string key) const;
+  T get(const string& key) const;
   template <typename T>
-  T get(string key, const T& default_value) const {
+  T get(const string& key, const T& default_value) const {
     return attrs_.count(key) ? this->get<T>(key) : default_value;
   }
 
@@ -261,29 +261,29 @@
 template <>
-string TFAttrs::get<string>(string key) const {
+string TFAttrs::get<string>(const string& key) const {
   return this->at(key)->s();
 }
 
 template <>
-std::vector<int> TFAttrs::get<std::vector<int>>(string key) const {
+std::vector<int> TFAttrs::get<std::vector<int>>(const string& key) const {
   auto attr = this->at(key)->list().i();
   return std::vector<int>(attr.begin(), attr.end());
 }
 
 template <>
-std::vector<float> TFAttrs::get<std::vector<float>>(string key) const {
+std::vector<float> TFAttrs::get<std::vector<float>>(const string& key) const {
   auto attr = this->at(key)->list().f();
   return std::vector<float>(attr.begin(), attr.end());
 }
 
 template <>
-std::vector<string> TFAttrs::get<std::vector<string>>(string key) const {
+std::vector<string> TFAttrs::get<std::vector<string>>(const string& key) const {
   auto attr = this->at(key)->list().s();
   return std::vector<string>(attr.begin(), attr.end());
 }
 
 template <>
-nvinfer1::Dims TFAttrs::get<nvinfer1::Dims>(string key) const {
+nvinfer1::Dims TFAttrs::get<nvinfer1::Dims>(const string& key) const {
   auto values = this->get<std::vector<int>>(key);
   nvinfer1::Dims dims;
   dims.nbDims = values.size();
@@ -293,24 +293,24 @@ nvinfer1::Dims TFAttrs::get<nvinfer1::Dims>(string key) const {
 }
 
 template <>
-nvinfer1::DataType TFAttrs::get<nvinfer1::DataType>(string key) const {
+nvinfer1::DataType TFAttrs::get<nvinfer1::DataType>(const string& key) const {
   nvinfer1::DataType trt_dtype(nvinfer1::DataType::kFLOAT);
   TF_CHECK_OK(ConvertDType(this->at(key)->type(), &trt_dtype));
   return trt_dtype;
 }
 
 template <>
-tensorflow::DataType TFAttrs::get<tensorflow::DataType>(string key) const {
+tensorflow::DataType TFAttrs::get<tensorflow::DataType>(const string& key) const {
   return this->at(key)->type();
 }
 
 template <>
-float TFAttrs::get<float>(string key) const {
+float TFAttrs::get<float>(const string& key) const {
   return this->at(key)->f();
 }
 
 template <>
-bool TFAttrs::get<bool>(string key) const {
+bool TFAttrs::get<bool>(const string& key) const {
   return this->at(key)->b();
 }

diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD b/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD
index 5603ed0ccf5..3b1a7fb6f33 100644
--- a/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD
@@ -1,3 +1,9 @@
+# Description:
+#   Example of plugin support in TensorRT (http://developer.nvidia.com/tensorrt)
+#   through TensorFlow integration. Targets the TensorRT 3.0.4 API, which is
+#   expected to change as TensorRT is upgraded.
+#   Add init_py to the pip package BUILD dependencies to install it.
+ package(default_visibility = ["//tensorflow:__subpackages__"]) load( @@ -8,6 +14,7 @@ load( "tf_gen_op_wrapper_py", "tf_py_wrap_cc", "tf_copts", + "tf_py_test", ) load( "@local_config_tensorrt//:build_defs.bzl", @@ -18,19 +25,16 @@ load("//tensorflow:tensorflow.bzl", "tf_kernel_library") tf_kernel_library( name = "_inc_op_plugin_kernel", - srcs = [ - "inc_op_plugin.cc", - ], - hdrs = [ - ], gpu_srcs = [ "inc_op_kernel.cu.cc", "inc_op_kernel.h", + "inc_op_plugin.cu.cc", "inc_op_plugin.h", ], - deps = if_tensorrt([ - "@local_config_tensorrt//:nv_infer", + deps = [ "//tensorflow/contrib/tensorrt:trt_plugins", + ] + if_tensorrt([ + "@local_config_tensorrt//:nv_infer", ]), ) @@ -38,9 +42,10 @@ tf_gen_op_libs( op_lib_names = [ "inc_op", ], - deps = if_tensorrt([ - "@local_config_tensorrt//:nv_infer", + deps = [ "//tensorflow/contrib/tensorrt:trt_plugins", + ] + if_tensorrt([ + "@local_config_tensorrt//:nv_infer", ]), ) @@ -70,9 +75,8 @@ tf_custom_op_library( srcs = ["ops/inc_op.cc"], deps = [ "//tensorflow/core:lib_proto_parsing", - ] + if_tensorrt([ "//tensorflow/contrib/tensorrt:trt_plugins", - ]), + ], ) tf_custom_op_py_library( @@ -97,6 +101,22 @@ py_library( ], ) +tf_py_test( + name = "plugin_test", + size = "small", + srcs = [ + "plugin_test.py", + ], + additional_deps = [ + ":init_py", + "//tensorflow/contrib/util:util_py", + "//tensorflow/contrib/tensorrt:init_py", + "//tensorflow/python:platform", + "//tensorflow/python:client_testlib", + "//tensorflow/python:tf_optimizer", + ], +) + py_library( name = "init_py", srcs = [ diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/__init__.py b/tensorflow/contrib/tensorrt/custom_plugin_examples/__init__.py index a61d0089418..e4cd0ae8a05 100644 --- a/tensorflow/contrib/tensorrt/custom_plugin_examples/__init__.py +++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/__init__.py @@ -14,11 +14,13 @@ # ============================================================================= """Import custom op for plugin and register it in plugin factory registry.""" -from ops import gen_inc_op -from plugin_wrap import inc_op_register -from inc_op import * +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.tensorrt.custom_plugin_examples.ops import gen_inc_op +from tensorflow.contrib.tensorrt.custom_plugin_examples.plugin_wrap import inc_op_register +from tensorflow.contrib.tensorrt.custom_plugin_examples import inc_op as import_inc_op_so -# pylint: disable=unused-import,wildcard-import,g-import-not-at-top inc_op = gen_inc_op.inc_plugin_trt inc_op_register() -# pylint: enable=unused-import,wildcard-import,g-import-not-at-top diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc index 5dd6b9bf949..38e1e01d954 100644 --- a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc +++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc @@ -14,10 +14,8 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h" -#include "tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h" #if GOOGLE_CUDA -#define EIGEN_USE_GPU #if GOOGLE_TENSORRT namespace tensorflow { diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h index ec269143e89..13156dad8fd 100644 --- a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h +++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h @@ -17,13 +17,14 @@ limitations under the License. #define TENSORFLOW_CONTRIB_TENSORRT_INC_OP #if GOOGLE_CUDA -#define EIGEN_USE_GPU #if GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { __global__ void VecInc(float* vec, float inc, float* dest, int n); +void IncrementKernel(const float* d_input, float inc, float* d_output, + int count, cudaStream_t stream); } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cu.cc similarity index 91% rename from tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc rename to tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cu.cc index 21617fa8b59..508ced587bd 100644 --- a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc +++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cu.cc @@ -13,8 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include #include "tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h" +#include +#include "tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h" #include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h" #if GOOGLE_CUDA @@ -23,7 +24,7 @@ limitations under the License. namespace tensorflow { namespace tensorrt { -const std::string IncOpPlugin::plugin_name_ = "IncPluginTRT"; +const string IncOpPlugin::plugin_name_ = "IncPluginTRT"; IncOpPlugin* CreateIncPlugin() { return new IncOpPlugin(); } @@ -47,7 +48,7 @@ IncOpPlugin::IncOpPlugin(const void* serialized_data, size_t length) SetAttribute("inc", buffer + consumed_data, sizeof(float)); } -bool IncOpPlugin::SetAttribute(const std::string& key, const void* ptr, +bool IncOpPlugin::SetAttribute(const string& key, const void* ptr, const size_t size) { if (strcmp(key.c_str(), "inc") == 0 && size == sizeof(float)) { StoreAttribute(key, ptr, size); // save the attribute to own the data; @@ -57,7 +58,7 @@ bool IncOpPlugin::SetAttribute(const std::string& key, const void* ptr, return false; } -bool IncOpPlugin::GetAttribute(const std::string& key, const void** ptr, +bool IncOpPlugin::GetAttribute(const string& key, const void** ptr, size_t* size) const { const auto& iter = attr_map_.find(key); if (iter != attr_map_.end()) { diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h index a4774d354ca..87404a755c2 100644 --- a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h +++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h @@ -18,10 +18,6 @@ limitations under the License. 
#include #include -#include -#include -#include -#include #include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h" #if GOOGLE_CUDA @@ -33,14 +29,14 @@ namespace tensorrt { class IncOpPlugin : public PluginTensorRT { public: - static const std::string plugin_name_; - IncOpPlugin(){}; + static const string plugin_name_; + IncOpPlugin() {}; IncOpPlugin(const void* serialized_data, size_t length); - const std::string& GetPluginName() const override { return plugin_name_; }; + const string& GetPluginName() const override { return plugin_name_; }; bool Finalize() override { return true; }; - bool SetAttribute(const std::string& key, const void* ptr, + bool SetAttribute(const string& key, const void* ptr, const size_t size) override; - bool GetAttribute(const std::string& key, const void** ptr, + bool GetAttribute(const string& key, const void** ptr, size_t* size) const override; int getNbOutputs() const override { return 1; } @@ -56,7 +52,7 @@ class IncOpPlugin : public PluginTensorRT { void configure(const nvinfer1::Dims* inputs, int num_inputs, const nvinfer1::Dims* outputs, int num_outputs, int max_batch_size) override { - assert(nb_inputs == 1); + assert(num_inputs == 1); PluginTensorRT::configure(inputs, num_inputs, outputs, num_outputs, max_batch_size); } @@ -95,8 +91,6 @@ class IncOpPlugin : public PluginTensorRT { IncOpPlugin* CreateIncPlugin(); IncOpPlugin* CreateIncPluginDeserialize(const void*, size_t); bool RegisterIncOpPlugin(); -void IncrementKernel(const float* d_input, float inc, float* d_output, - int count, cudaStream_t stream); } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/ops/inc_op.cc b/tensorflow/contrib/tensorrt/custom_plugin_examples/ops/inc_op.cc index 0dfead8f57a..7466e590901 100644 --- a/tensorflow/contrib/tensorrt/custom_plugin_examples/ops/inc_op.cc +++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/ops/inc_op.cc @@ -19,7 +19,7 @@ limitations under the License. 
#if GOOGLE_CUDA #if GOOGLE_TENSORRT -using namespace tensorflow; +namespace tensorflow { REGISTER_OP("IncPluginTRT") .Attr("inc: list(float)") @@ -30,5 +30,7 @@ REGISTER_OP("IncPluginTRT") return Status::OK(); }); +} // namespace tensorflow + #endif // GOOGLE_CUDA #endif // GOOGLE_TENSORRT diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/test/plugin_test.py b/tensorflow/contrib/tensorrt/custom_plugin_examples/plugin_test.py similarity index 67% rename from tensorflow/contrib/tensorrt/custom_plugin_examples/test/plugin_test.py rename to tensorflow/contrib/tensorrt/custom_plugin_examples/plugin_test.py index 52f49ae00e8..9f773c66a99 100644 --- a/tensorflow/contrib/tensorrt/custom_plugin_examples/test/plugin_test.py +++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/plugin_test.py @@ -23,43 +23,44 @@ from __future__ import print_function # it looks like internal builds don't like it so # importing every module individually -from tensorflow.contrib import tensorrt as trt -from tensorflow.core.protobuf import config_pb2 as cpb2 -from tensorflow.python.client import session as csess -from tensorflow.python.framework import dtypes as dtypes -from tensorflow.python.framework import importer as importer -from tensorflow.python.framework import ops as ops -from tensorflow.python.ops import array_ops as aops -from tensorflow.python.ops import nn as nn -from tensorflow.python.ops import nn_ops as nn_ops -import numpy as np +from tensorflow.contrib import tensorrt +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.client import session +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import importer +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import nn +from tensorflow.python.ops import nn_ops +from tensorflow.python.framework import errors +import numpy # import custom_op as plugin op -# the python api handles registration to the plugin factory -from tensorflow.contrib.tensorrt import custom_plugin_examples as cpe +# the python api handles registration to the plugin factory +from tensorflow.contrib.tensorrt import custom_plugin_examples def get_plugin_graph_def(): """Create a simple graph and return its graph_def.""" g = ops.Graph() with g.as_default(): - a = aops.placeholder( + a = array_ops.placeholder( dtype=dtypes.float32, shape=(None, 24, 24, 2), name="input") relu = nn.relu(a, "relu") v = nn_ops.max_pool( relu, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool") # insert custom_op in the graph - v = cpe.inc_op(v, inc=[16.5], name="plugin_test") + v = custom_plugin_examples.inc_op(v, inc=[16.5], name="plugin_test") v = v*2.0 v = nn.relu(v) v = nn.relu(v) - aops.squeeze(v, name="output") + array_ops.squeeze(v, name="output") return g.as_graph_def() def run_graph(gdef, dumm_inp): """Run given graphdef once.""" - gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50) + gpu_options = config_pb2.GPUOptions(per_process_gpu_memory_fraction=0.50) ops.reset_default_graph() g = ops.Graph() with g.as_default(): @@ -68,20 +69,20 @@ def run_graph(gdef, dumm_inp): inp = inp.outputs[0] out = out.outputs[0] - with csess.Session( - config=cpb2.ConfigProto(gpu_options=gpu_options), graph=g) as sess: + with session.Session( + config=config_pb2.ConfigProto(gpu_options=gpu_options), graph=g) as sess: val = sess.run(out, {inp: dumm_inp}) return val if "__main__" in __name__: inp_dims = (5, 24, 24, 2) - dummy_input = np.ones(inp_dims).astype(np.float32) + 
   dummy_input = numpy.ones(inp_dims).astype(numpy.float32)
   orig_graph = get_plugin_graph_def()  # graph with plugin node
 
   # trigger conversion.
   # plugin nodes have been registered during import, converter will be able to
   # create corresponding plugin layer during conversion.
-  trt_graph = trt.create_inference_graph(
+  trt_graph = tensorrt.create_inference_graph(
       input_graph_def=orig_graph,
       outputs=["output"],
       max_batch_size=inp_dims[0],
@@ -90,4 +91,7 @@ if "__main__" in __name__:
       minimum_segment_size=2
   )
   o2 = run_graph(trt_graph, dummy_input)
-  print (o2)
+  if o2.reshape([-1])[0] == 35:
+    print("pass")
+  else:
+    raise RuntimeError("contrib/tensorrt/custom_plugin_examples wrong result")

diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.h b/tensorflow/contrib/tensorrt/log/trt_logger.h
index 7f3544f8cfd..3495dc63185 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.h
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.h
@@ -28,7 +28,7 @@ namespace tensorrt {
 // Logger for GIE info/warning/errors
 class Logger : public nvinfer1::ILogger {
  public:
-  Logger(string name = "DefaultLogger") : name_(name){};
+  Logger(string name = "DefaultLogger") : name_(name) {};
   void log(nvinfer1::ILogger::Severity severity, const char* msg) override;
 
  private:

diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin.cc b/tensorflow/contrib/tensorrt/plugin/trt_plugin.cc
index 82c549dbf50..062f86e8bb4 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin.cc
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin.cc
@@ -25,7 +25,6 @@ namespace tensorflow {
 namespace tensorrt {
 
 PluginTensorRT::PluginTensorRT(const void* serialized_data, size_t length) {
-  // sanity check.
   const char* buffer = static_cast<const char*>(serialized_data);
   size_t op_name_char_count = *reinterpret_cast<const size_t*>(buffer);
   buffer += sizeof(size_t);
@@ -91,7 +90,7 @@ void PluginTensorRT::serialize(void* serialized_data) {
   }
 }
 
-bool PluginTensorRT::StoreAttribute(const std::string& key, const void* ptr,
+bool PluginTensorRT::StoreAttribute(const string& key, const void* ptr,
                                     const size_t size) {
   if (attr_map_.count(key) != 0) return false;
 
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin.h b/tensorflow/contrib/tensorrt/plugin/trt_plugin.h
index 772974a769b..dca377c2d2b 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin.h
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin.h
@@ -17,9 +17,9 @@ limitations under the License.
 #define TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN
 
 #include <iostream>
-#include <string>
 #include <unordered_map>
 #include <vector>
+#include "tensorflow/core/platform/types.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
@@ -35,28 +35,28 @@ namespace tensorrt {
 // PluginDeserializeFunc & PluginConstructFunc through PluginFactoryTensorRT
 class PluginTensorRT : public nvinfer1::IPlugin {
  public:
-  PluginTensorRT(){};
+  PluginTensorRT() {};
   PluginTensorRT(const void* serialized_data, size_t length);
-  virtual const std::string& GetPluginName() const = 0;
+  virtual const string& GetPluginName() const = 0;
   virtual bool Finalize() = 0;
-  virtual bool SetAttribute(const std::string& key, const void* ptr,
+  virtual bool SetAttribute(const string& key, const void* ptr,
                             const size_t size) = 0;
-  virtual bool GetAttribute(const std::string& key, const void** ptr,
+  virtual bool GetAttribute(const string& key, const void** ptr,
                             size_t* size) const = 0;
 
   void configure(const nvinfer1::Dims* inputs, int num_inputs,
                  const nvinfer1::Dims* outputs, int num_outputs,
                  int max_batch_size) override;
 
-  virtual bool StoreAttribute(const std::string& key, const void* ptr,
+  virtual bool StoreAttribute(const string& key, const void* ptr,
                               const size_t size);
 
   virtual size_t getSerializationSize() override;
 
   virtual void serialize(void* buffer) override;
 
 protected:
-  std::unordered_map<std::string, std::vector<char> > attr_map_;
+  std::unordered_map<string, std::vector<char> > attr_map_;
 
   std::vector<nvinfer1::Dims> input_dim_list_;
 };

diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc
index 776bce119df..736a1321fe7 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc
@@ -26,7 +26,7 @@ PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
                                                     size_t serial_length) {
   size_t parsed_byte = 0;
   // extract op_name from serial_data
-  std::string encoded_op_name =
+  string encoded_op_name =
       ExtractOpName(serial_data, serial_length, &parsed_byte);
 
   if (!IsPlugin(encoded_op_name)) {
@@ -41,8 +41,7 @@ PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
   return plugin_ptr;
 }
 
-PluginTensorRT* PluginFactoryTensorRT::CreatePlugin(
-    const std::string& op_name) {
+PluginTensorRT* PluginFactoryTensorRT::CreatePlugin(const string& op_name) {
   if (!IsPlugin(op_name)) return nullptr;
 
   std::lock_guard<std::mutex> lock(instance_m_);
@@ -53,7 +52,7 @@ PluginTensorRT* PluginFactoryTensorRT::CreatePlugin(
 }
 
 bool PluginFactoryTensorRT::RegisterPlugin(
-    const std::string& op_name, PluginDeserializeFunc deserialize_func,
+    const string& op_name, PluginDeserializeFunc deserialize_func,
     PluginConstructFunc construct_func) {
   if (IsPlugin(op_name)) return false;
 
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h
index 08fd3768445..4e4a3af4cab 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h
@@ -36,7 +36,7 @@ class PluginFactoryTensorRT : public nvinfer1::IPluginFactory {
                                  size_t serial_length) override;
 
   // plugin construction, PluginFactoryTensorRT owns the plugin;
-  PluginTensorRT* CreatePlugin(const std::string& op_name);
+  PluginTensorRT* CreatePlugin(const string& op_name);
 
   static PluginFactoryTensorRT* GetInstance() {
     static PluginFactoryTensorRT* factory_instance =
        new PluginFactoryTensorRT();
     return factory_instance;
   }
 
-  bool RegisterPlugin(const std::string& op_name,
+  bool
RegisterPlugin(const string& op_name, PluginDeserializeFunc deserialize_func, PluginConstructFunc construct_func); - bool IsPlugin(const std::string& op_name) { + bool IsPlugin(const string& op_name) { return plugin_registry_.find(op_name) != plugin_registry_.end(); } @@ -57,7 +57,7 @@ class PluginFactoryTensorRT : public nvinfer1::IPluginFactory { void DestroyPlugins(); protected: - std::unordered_map > plugin_registry_; diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc b/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc index c5d3f38280e..a8f60886c03 100644 --- a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc +++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc @@ -30,7 +30,7 @@ string ExtractOpName(const void* serial_data, size_t serial_length, assert(serial_length >= *incremental); const char* buffer = static_cast(serial_data) + sizeof(size_t); - std::string op_name(buffer, op_name_char_count); + string op_name(buffer, op_name_char_count); return op_name; } diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugins_test.cc b/tensorflow/contrib/tensorrt/plugin/trt_plugins_test.cc index 9ef0fce972a..b834c5511f9 100644 --- a/tensorflow/contrib/tensorrt/plugin/trt_plugins_test.cc +++ b/tensorflow/contrib/tensorrt/plugin/trt_plugins_test.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/test.h" @@ -31,18 +30,17 @@ namespace test { class StubPlugin : public PluginTensorRT { public: - static const std::string plugin_name_; - StubPlugin(){}; + static const string plugin_name_; + StubPlugin() {}; StubPlugin(const void* serialized_data, size_t length) - : PluginTensorRT(serialized_data, length){}; - const std::string& GetPluginName() override { return plugin_name_; }; + : PluginTensorRT(serialized_data, length) {}; + const string& GetPluginName() override { return plugin_name_; }; virtual bool Finalize() { return true; }; - virtual bool SetAttribute(const std::string& key, const void* ptr, + virtual bool SetAttribute(const string& key, const void* ptr, const size_t size) { return true; }; - virtual bool GetAttribute(const std::string& key, const void* ptr, - size_t& size) { + virtual bool GetAttribute(const string& key, const void* ptr, size_t& size) { return true; }; int getNbOutputs() const override { return 1; } @@ -59,7 +57,7 @@ class StubPlugin : public PluginTensorRT { } }; -const std::string StubPlugin::plugin_name_ = "StubPlugin"; +const string StubPlugin::plugin_name_ = "StubPlugin"; StubPlugin* CreateStubPlugin() { return new StubPlugin(); } @@ -72,8 +70,9 @@ class PluginTest : public ::testing::Test { public: bool RegisterStubPlugin() { if (PluginFactoryTensorRT::GetInstance()->IsPlugin( - StubPlugin::plugin_name_)) + StubPlugin::plugin_name_)) { return true; + } return PluginFactoryTensorRT::GetInstance()->RegisterPlugin( StubPlugin::plugin_name_, CreateStubPluginDeserialize, CreateStubPlugin); diff --git a/tensorflow/contrib/tensorrt/plugin_test.py b/tensorflow/contrib/tensorrt/plugin_test.py new file mode 100644 index 00000000000..7c3e765bff4 --- /dev/null +++ b/tensorflow/contrib/tensorrt/plugin_test.py @@ -0,0 +1,88 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Script to show usage of TensorRT custom op & plugin.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib import tensorrt +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.client import session +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import importer +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import nn +from tensorflow.python.ops import nn_ops +import numpy as np + +# import custom_op as plugin op +# the python api handles registration to the plugin factory +from tensorflow.contrib.tensorrt import custom_plugin_examples + +def get_plugin_graph_def(): + """Create a simple graph and return its graph_def.""" + g = ops.Graph() + with g.as_default(): + a = array_ops.placeholder( + dtype=dtypes.float32, shape=(None, 24, 24, 2), name="input") + relu = nn.relu(a, "relu") + v = nn_ops.max_pool( + relu, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool") + + # insert custom_op in the graph + v = custom_plugin_examples.inc_op(v, inc=[16.5], name="plugin_test") + + v = v*2.0 + v = nn.relu(v) + v = nn.relu(v) + array_ops.squeeze(v, name="output") + return g.as_graph_def() + +def run_graph(gdef, dumm_inp): + """Run given graphdef once.""" + gpu_options = config_pb2.GPUOptions(per_process_gpu_memory_fraction=0.50) + ops.reset_default_graph() + g = ops.Graph() + with g.as_default(): + inp, out = importer.import_graph_def( + graph_def=gdef, return_elements=["input", "output"]) + inp = inp.outputs[0] + out = out.outputs[0] + + with session.Session( + config=config_pb2.ConfigProto(gpu_options=gpu_options), graph=g) as sess: + val = sess.run(out, {inp: dumm_inp}) + return val + +if "__main__" in __name__: + inp_dims = (5, 24, 24, 2) + dummy_input = np.ones(inp_dims).astype(np.float32) + orig_graph = get_plugin_graph_def() # graph with plugin node + + # trigger conversion. + # plugin nodes have been registered during import, converter will be able to + # create corresponding plugin layer during conversion. 
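+  # With an all-ones input, relu and max_pool leave every element at 1.0; the
+  # plugin adds 16.5 and the graph then doubles it, so each output value
+  # should be 35 (the custom_plugin_examples copy of this test asserts that).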
+  trt_graph = tensorrt.create_inference_graph(
+      input_graph_def=orig_graph,
+      outputs=["output"],
+      max_batch_size=inp_dims[0],
+      max_workspace_size_bytes=1 << 25,
+      precision_mode="FP32",
+      minimum_segment_size=2
+  )
+  o2 = run_graph(trt_graph, dummy_input)
+  print (o2)

diff --git a/tensorflow/contrib/tensorrt/resources/trt_resources.h b/tensorflow/contrib/tensorrt/resources/trt_resources.h
index 3c85968ae7a..5164247f938 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resources.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_resources.h
@@ -82,7 +82,7 @@ class TRTWeightStore : public tensorflow::ResourceBase {
 
 class TRTEngineResource : public tensorflow::ResourceBase {
  public:
-  TRTEngineResource() : runtime_(nullptr), ctx_(nullptr){};
+  TRTEngineResource() : runtime_(nullptr), ctx_(nullptr) {};
   string DebugString() override { return string(""); }
   nvinfer1::IRuntime* runtime_;
   nvinfer1::IExecutionContext* ctx_;

From 1e7289fc0e64a706bb1867cfe5a8c5f5d2f7150f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Thu, 19 Apr 2018 14:05:06 -0700
Subject: [PATCH 0455/1734] Make flat_transforms_to_matrices and
 matrices_to_flat_transforms publicly available.

PiperOrigin-RevId: 193571089
---
 tensorflow/contrib/image/__init__.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/contrib/image/__init__.py b/tensorflow/contrib/image/__init__.py
index e982030bc89..8f406ace1d5 100755
--- a/tensorflow/contrib/image/__init__.py
+++ b/tensorflow/contrib/image/__init__.py
@@ -25,6 +25,8 @@ projective transforms (including rotation) are supported.
 @@angles_to_projective_transforms
 @@compose_transforms
 @@adjust_yiq_hsv
+@@flat_transforms_to_matrices
+@@matrices_to_flat_transforms
 @@random_yiq_hsv
 @@rotate
 @@transform
@@ -58,6 +60,8 @@ from tensorflow.contrib.image.python.ops.distort_image_ops import random_hsv_in_yiq
 from tensorflow.contrib.image.python.ops.image_ops import angles_to_projective_transforms
 from tensorflow.contrib.image.python.ops.image_ops import compose_transforms
 from tensorflow.contrib.image.python.ops.image_ops import connected_components
+from tensorflow.contrib.image.python.ops.image_ops import flat_transforms_to_matrices
+from tensorflow.contrib.image.python.ops.image_ops import matrices_to_flat_transforms
 from tensorflow.contrib.image.python.ops.image_ops import rotate
 from tensorflow.contrib.image.python.ops.image_ops import transform
 from tensorflow.contrib.image.python.ops.image_ops import translate

From ab5abfa42bdced7bf1c371e5e1224bdc1fafdcc1 Mon Sep 17 00:00:00 2001
From: Asim Shankar
Date: Thu, 19 Apr 2018 14:10:01 -0700
Subject: [PATCH 0456/1734] RecordReader: Simplify interface contract and
 implementation.

Prior to this change, RecordReader had the following contract:

- Records can be read in any order, EXCEPT if compression or buffering
  was enabled.

- If the underlying file is being concurrently written to then calls to
  ReadRecord() may fail (because of an incomplete record near the end of a
  file), but a retry may succeed (once the record is written), EXCEPT if
  compression or buffering is enabled (in which case the failure will be
  terminal).

This "retry-may-succeed" behavior is relied upon by tensorboard
(https://github.com/tensorflow/tensorboard/blob/1.7/tensorboard/backend/event_processing/event_file_loader.py#L55)
where one process (typically the model training process) is writing
tf.summary events to an event file and another process (tensorboard) is
concurrently reading it.
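
For concreteness, that polling pattern looks roughly like this (a hedged
sketch in the spirit of the linked event_file_loader.py; process_event and
the event-file path are placeholders):

    import time
    from tensorboard.backend.event_processing import event_file_loader

    loader = event_file_loader.EventFileLoader("/tmp/events.out.tfevents.x")
    while True:
      for event in loader.Load():  # yields whatever is fully written so far
        process_event(event)
      # A record whose tail is still being written fails to parse now, but
      # the loader retries from the same offset on the next Load() call.
      time.sleep(2.0)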
With this change, the intent is to remove the EXCEPTions and have the same behavior irrespective of compression/buffering. Additionally, fix a memory leak when ZlibInputStream::Reset() is invoked. PiperOrigin-RevId: 193571934 --- tensorflow/core/lib/io/record_reader.cc | 147 ++++---------- tensorflow/core/lib/io/record_reader.h | 16 +- tensorflow/core/lib/io/recordio_test.cc | 216 ++++++++++++++------- tensorflow/core/lib/io/zlib_inputstream.cc | 9 +- tensorflow/core/lib/io/zlib_inputstream.h | 10 +- 5 files changed, 208 insertions(+), 190 deletions(-) diff --git a/tensorflow/core/lib/io/record_reader.cc b/tensorflow/core/lib/io/record_reader.cc index 6de850bb207..c24628be570 100644 --- a/tensorflow/core/lib/io/record_reader.cc +++ b/tensorflow/core/lib/io/record_reader.cc @@ -56,110 +56,55 @@ RecordReaderOptions RecordReaderOptions::CreateRecordReaderOptions( RecordReader::RecordReader(RandomAccessFile* file, const RecordReaderOptions& options) - : src_(file), options_(options) { + : options_(options), + input_stream_(new RandomAccessInputStream(file)), + last_read_failed_(false) { if (options.buffer_size > 0) { - input_stream_.reset(new BufferedInputStream(file, options.buffer_size)); - } else { - input_stream_.reset(new RandomAccessInputStream(file)); + input_stream_.reset(new BufferedInputStream(input_stream_.release(), + options.buffer_size, true)); } if (options.compression_type == RecordReaderOptions::ZLIB_COMPRESSION) { // We don't have zlib available on all embedded platforms, so fail. #if defined(IS_SLIM_BUILD) LOG(FATAL) << "Zlib compression is unsupported on mobile platforms."; #else // IS_SLIM_BUILD - zlib_input_stream_.reset(new ZlibInputStream( - input_stream_.get(), options.zlib_options.input_buffer_size, - options.zlib_options.output_buffer_size, options.zlib_options)); + input_stream_.reset(new ZlibInputStream( + input_stream_.release(), options.zlib_options.input_buffer_size, + options.zlib_options.output_buffer_size, options.zlib_options, true)); #endif // IS_SLIM_BUILD } else if (options.compression_type == RecordReaderOptions::NONE) { // Nothing to do. } else { - LOG(FATAL) << "Unspecified compression type :" << options.compression_type; + LOG(FATAL) << "Unrecognized compression type :" << options.compression_type; } } // Read n+4 bytes from file, verify that checksum of first n bytes is // stored in the last 4 bytes and store the first n bytes in *result. -// May use *storage as backing store. -Status RecordReader::ReadChecksummed(uint64 offset, size_t n, - StringPiece* result, string* storage) { +// +// offset corresponds to the user-provided value to ReadRecord() +// and is used only in error messages. +Status RecordReader::ReadChecksummed(uint64 offset, size_t n, string* result) { if (n >= SIZE_MAX - sizeof(uint32)) { return errors::DataLoss("record size too large"); } const size_t expected = n + sizeof(uint32); - storage->resize(expected); + TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(expected, result)); -#if !defined(IS_SLIM_BUILD) - if (zlib_input_stream_) { - // If we have a zlib compressed buffer, we assume that the - // file is being read sequentially, and we use the underlying - // implementation to read the data. - // - // No checks are done to validate that the file is being read - // sequentially. At some point the zlib input buffer may support - // seeking, possibly inefficiently. 
- TF_RETURN_IF_ERROR(zlib_input_stream_->ReadNBytes(expected, storage)); - - if (storage->size() != expected) { - if (storage->empty()) { - return errors::OutOfRange("eof"); - } else { - return errors::DataLoss("truncated record at ", offset); - } - } - - uint32 masked_crc = core::DecodeFixed32(storage->data() + n); - if (crc32c::Unmask(masked_crc) != crc32c::Value(storage->data(), n)) { - return errors::DataLoss("corrupted record at ", offset); - } - *result = StringPiece(storage->data(), n); - } else { -#endif // IS_SLIM_BUILD - if (options_.buffer_size > 0) { - // If we have a buffer, we assume that the file is being read - // sequentially, and we use the underlying implementation to read the - // data. - // - // No checks are done to validate that the file is being read - // sequentially. - TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(expected, storage)); - - if (storage->size() != expected) { - if (storage->empty()) { - return errors::OutOfRange("eof"); - } else { - return errors::DataLoss("truncated record at ", offset); - } - } - - const uint32 masked_crc = core::DecodeFixed32(storage->data() + n); - if (crc32c::Unmask(masked_crc) != crc32c::Value(storage->data(), n)) { - return errors::DataLoss("corrupted record at ", offset); - } - *result = StringPiece(storage->data(), n); + if (result->size() != expected) { + if (result->empty()) { + return errors::OutOfRange("eof"); } else { - // This version supports reading from arbitrary offsets - // since we are accessing the random access file directly. - StringPiece data; - TF_RETURN_IF_ERROR(src_->Read(offset, expected, &data, &(*storage)[0])); - if (data.size() != expected) { - if (data.empty()) { - return errors::OutOfRange("eof"); - } else { - return errors::DataLoss("truncated record at ", offset); - } - } - const uint32 masked_crc = core::DecodeFixed32(data.data() + n); - if (crc32c::Unmask(masked_crc) != crc32c::Value(data.data(), n)) { - return errors::DataLoss("corrupted record at ", offset); - } - *result = StringPiece(data.data(), n); + return errors::DataLoss("truncated record at ", offset); } -#if !defined(IS_SLIM_BUILD) } -#endif // IS_SLIM_BUILD + const uint32 masked_crc = core::DecodeFixed32(result->data() + n); + if (crc32c::Unmask(masked_crc) != crc32c::Value(result->data(), n)) { + return errors::DataLoss("corrupted record at ", offset); + } + result->resize(n); return Status::OK(); } @@ -167,50 +112,42 @@ Status RecordReader::ReadRecord(uint64* offset, string* record) { static const size_t kHeaderSize = sizeof(uint64) + sizeof(uint32); static const size_t kFooterSize = sizeof(uint32); + // Position the input stream. + int64 curr_pos = input_stream_->Tell(); + int64 desired_pos = static_cast(*offset); + if (curr_pos > desired_pos || curr_pos < 0 /* EOF */ || + (curr_pos == desired_pos && last_read_failed_)) { + last_read_failed_ = false; + TF_RETURN_IF_ERROR(input_stream_->Reset()); + TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(desired_pos)); + } else if (curr_pos < desired_pos) { + TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(desired_pos - curr_pos)); + } + DCHECK_EQ(desired_pos, input_stream_->Tell()); + // Read header data. 
- StringPiece lbuf; - Status s = ReadChecksummed(*offset, sizeof(uint64), &lbuf, record); + Status s = ReadChecksummed(*offset, sizeof(uint64), record); if (!s.ok()) { + last_read_failed_ = true; return s; } - const uint64 length = core::DecodeFixed64(lbuf.data()); + const uint64 length = core::DecodeFixed64(record->data()); // Read data - StringPiece data; - s = ReadChecksummed(*offset + kHeaderSize, length, &data, record); + s = ReadChecksummed(*offset + kHeaderSize, length, record); if (!s.ok()) { + last_read_failed_ = true; if (errors::IsOutOfRange(s)) { s = errors::DataLoss("truncated record at ", *offset); } return s; } - if (record->data() != data.data()) { - // RandomAccessFile placed the data in some other location. - memmove(&(*record)[0], data.data(), data.size()); - } - - record->resize(data.size()); - *offset += kHeaderSize + length + kFooterSize; + DCHECK_EQ(*offset, input_stream_->Tell()); return Status::OK(); } -Status RecordReader::SkipNBytes(uint64 offset) { -#if !defined(IS_SLIM_BUILD) - if (zlib_input_stream_) { - TF_RETURN_IF_ERROR(zlib_input_stream_->SkipNBytes(offset)); - } else { -#endif - if (options_.buffer_size > 0) { - TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(offset)); - } -#if !defined(IS_SLIM_BUILD) - } -#endif - return Status::OK(); -} // namespace io - SequentialRecordReader::SequentialRecordReader( RandomAccessFile* file, const RecordReaderOptions& options) : underlying_(file, options), offset_(0) {} diff --git a/tensorflow/core/lib/io/record_reader.h b/tensorflow/core/lib/io/record_reader.h index 26278e03284..f6d587dfa0e 100644 --- a/tensorflow/core/lib/io/record_reader.h +++ b/tensorflow/core/lib/io/record_reader.h @@ -69,25 +69,14 @@ class RecordReader { // Read the record at "*offset" into *record and update *offset to // point to the offset of the next record. Returns OK on success, // OUT_OF_RANGE for end of file, or something else for an error. - // - // Note: if buffering is used (with or without compression), access must be - // sequential. Status ReadRecord(uint64* offset, string* record); - // Skip the records till "offset". Returns OK on success, - // OUT_OF_RANGE for end of file, or something else for an error. - Status SkipNBytes(uint64 offset); - private: - Status ReadChecksummed(uint64 offset, size_t n, StringPiece* result, - string* storage); + Status ReadChecksummed(uint64 offset, size_t n, string* result); - RandomAccessFile* src_; RecordReaderOptions options_; std::unique_ptr input_stream_; -#if !defined(IS_SLIM_BUILD) - std::unique_ptr zlib_input_stream_; -#endif // IS_SLIM_BUILD + bool last_read_failed_; TF_DISALLOW_COPY_AND_ASSIGN(RecordReader); }; @@ -121,7 +110,6 @@ class SequentialRecordReader { return errors::InvalidArgument( "Trying to seek offset: ", offset, " which is less than the current offset: ", offset_); - TF_RETURN_IF_ERROR(underlying_.SkipNBytes(offset - offset_)); offset_ = offset; return Status::OK(); } diff --git a/tensorflow/core/lib/io/recordio_test.cc b/tensorflow/core/lib/io/recordio_test.cc index 63235761d92..da514bd21c7 100644 --- a/tensorflow/core/lib/io/recordio_test.cc +++ b/tensorflow/core/lib/io/recordio_test.cc @@ -26,10 +26,11 @@ limitations under the License. namespace tensorflow { namespace io { +namespace { // Construct a string of the specified length made out of the supplied // partial string. 
-static string BigString(const string& partial_string, size_t n) { +string BigString(const string& partial_string, size_t n) { string result; while (result.size() < n) { result.append(partial_string); @@ -39,62 +40,66 @@ static string BigString(const string& partial_string, size_t n) { } // Construct a string from a number -static string NumberString(int n) { +string NumberString(int n) { char buf[50]; snprintf(buf, sizeof(buf), "%d.", n); return string(buf); } // Return a skewed potentially long string -static string RandomSkewedString(int i, random::SimplePhilox* rnd) { +string RandomSkewedString(int i, random::SimplePhilox* rnd) { return BigString(NumberString(i), rnd->Skewed(17)); } +class StringDest : public WritableFile { + public: + explicit StringDest(string* contents) : contents_(contents) {} + + Status Close() override { return Status::OK(); } + Status Flush() override { return Status::OK(); } + Status Sync() override { return Status::OK(); } + Status Append(const StringPiece& slice) override { + contents_->append(slice.data(), slice.size()); + return Status::OK(); + } + + private: + string* contents_; +}; + +class StringSource : public RandomAccessFile { + public: + explicit StringSource(string* contents) + : contents_(contents), force_error_(false) {} + + Status Read(uint64 offset, size_t n, StringPiece* result, + char* scratch) const override { + if (force_error_) { + force_error_ = false; + return errors::DataLoss("read error"); + } + + if (offset >= contents_->size()) { + return errors::OutOfRange("end of file"); + } + + if (contents_->size() < offset + n) { + n = contents_->size() - offset; + } + *result = StringPiece(contents_->data() + offset, n); + return Status::OK(); + } + + void force_error() { force_error_ = true; } + + private: + string* contents_; + mutable bool force_error_; +}; + class RecordioTest : public ::testing::Test { private: - class StringDest : public WritableFile { - public: - string contents_; - - Status Close() override { return Status::OK(); } - Status Flush() override { return Status::OK(); } - Status Sync() override { return Status::OK(); } - Status Append(const StringPiece& slice) override { - contents_.append(slice.data(), slice.size()); - return Status::OK(); - } - }; - - class StringSource : public RandomAccessFile { - public: - StringPiece contents_; - mutable bool force_error_; - mutable bool returned_partial_; - StringSource() : force_error_(false), returned_partial_(false) {} - - Status Read(uint64 offset, size_t n, StringPiece* result, - char* scratch) const override { - EXPECT_FALSE(returned_partial_) << "must not Read() after eof/error"; - - if (force_error_) { - force_error_ = false; - returned_partial_ = true; - return errors::DataLoss("read error"); - } - - if (offset >= contents_.size()) { - return errors::OutOfRange("end of file"); - } - - if (contents_.size() < offset + n) { - n = contents_.size() - offset; - returned_partial_ = true; - } - *result = StringPiece(contents_.data() + offset, n); - return Status::OK(); - } - }; - + string contents_; StringDest dest_; StringSource source_; bool reading_; @@ -104,7 +109,9 @@ class RecordioTest : public ::testing::Test { public: RecordioTest() - : reading_(false), + : dest_(&contents_), + source_(&contents_), + reading_(false), readpos_(0), writer_(new RecordWriter(&dest_)), reader_(new RecordReader(&source_)) {} @@ -119,12 +126,11 @@ class RecordioTest : public ::testing::Test { TF_ASSERT_OK(writer_->WriteRecord(StringPiece(msg))); } - size_t WrittenBytes() const { return 
dest_.contents_.size(); } + size_t WrittenBytes() const { return contents_.size(); } string Read() { if (!reading_) { reading_ = true; - source_.contents_ = StringPiece(dest_.contents_); } string record; Status s = reader_->ReadRecord(&readpos_, &record); @@ -137,26 +143,20 @@ class RecordioTest : public ::testing::Test { } } - void IncrementByte(int offset, int delta) { - dest_.contents_[offset] += delta; - } + void IncrementByte(int offset, int delta) { contents_[offset] += delta; } - void SetByte(int offset, char new_byte) { - dest_.contents_[offset] = new_byte; - } + void SetByte(int offset, char new_byte) { contents_[offset] = new_byte; } - void ShrinkSize(int bytes) { - dest_.contents_.resize(dest_.contents_.size() - bytes); - } + void ShrinkSize(int bytes) { contents_.resize(contents_.size() - bytes); } void FixChecksum(int header_offset, int len) { // Compute crc of type/len/data - uint32_t crc = crc32c::Value(&dest_.contents_[header_offset + 6], 1 + len); + uint32_t crc = crc32c::Value(&contents_[header_offset + 6], 1 + len); crc = crc32c::Mask(crc); - core::EncodeFixed32(&dest_.contents_[header_offset], crc); + core::EncodeFixed32(&contents_[header_offset], crc); } - void ForceError() { source_.force_error_ = true; } + void ForceError() { source_.force_error(); } void StartReadingAt(uint64_t initial_offset) { readpos_ = initial_offset; } @@ -165,7 +165,6 @@ class RecordioTest : public ::testing::Test { Write("bar"); Write(BigString("x", 10000)); reading_ = true; - source_.contents_ = StringPiece(dest_.contents_); uint64 offset = WrittenBytes() + offset_past_end; string record; Status s = reader_->ReadRecord(&offset, &record); @@ -217,16 +216,100 @@ TEST_F(RecordioTest, RandomRead) { ASSERT_EQ("EOF", Read()); } +void TestNonSequentialReads(const RecordWriterOptions& writer_options, + const RecordReaderOptions& reader_options) { + string contents; + StringDest dst(&contents); + RecordWriter writer(&dst, writer_options); + for (int i = 0; i < 10; ++i) { + TF_ASSERT_OK(writer.WriteRecord(NumberString(i))) << i; + } + TF_ASSERT_OK(writer.Close()); + + StringSource file(&contents); + RecordReader reader(&file, reader_options); + + string record; + // First read sequentially to fill in the offsets table. + uint64 offsets[10] = {0}; + uint64 offset = 0; + for (int i = 0; i < 10; ++i) { + offsets[i] = offset; + TF_ASSERT_OK(reader.ReadRecord(&offset, &record)) << i; + } + + // Read randomly: First go back to record #3 then forward to #8. 
+ offset = offsets[3]; + TF_ASSERT_OK(reader.ReadRecord(&offset, &record)); + EXPECT_EQ("3.", record); + EXPECT_EQ(offsets[4], offset); + + offset = offsets[8]; + TF_ASSERT_OK(reader.ReadRecord(&offset, &record)); + EXPECT_EQ("8.", record); + EXPECT_EQ(offsets[9], offset); +} + +TEST_F(RecordioTest, NonSequentialReads) { + TestNonSequentialReads(RecordWriterOptions(), RecordReaderOptions()); +} + +TEST_F(RecordioTest, NonSequentialReadsWithReadBuffer) { + RecordReaderOptions options; + options.buffer_size = 1 << 10; + TestNonSequentialReads(RecordWriterOptions(), options); +} + +TEST_F(RecordioTest, NonSequentialReadsWithCompression) { + TestNonSequentialReads( + RecordWriterOptions::CreateRecordWriterOptions("ZLIB"), + RecordReaderOptions::CreateRecordReaderOptions("ZLIB")); +} + // Tests of all the error paths in log_reader.cc follow: -static void AssertHasSubstr(StringPiece s, StringPiece expected) { +void AssertHasSubstr(StringPiece s, StringPiece expected) { EXPECT_TRUE(str_util::StrContains(s, expected)) << s << " does not contain " << expected; } +void TestReadError(const RecordWriterOptions& writer_options, + const RecordReaderOptions& reader_options) { + const string wrote = BigString("well hello there!", 100); + string contents; + StringDest dst(&contents); + TF_ASSERT_OK(RecordWriter(&dst, writer_options).WriteRecord(wrote)); + + StringSource file(&contents); + RecordReader reader(&file, reader_options); + + uint64 offset = 0; + string read; + file.force_error(); + Status status = reader.ReadRecord(&offset, &read); + ASSERT_TRUE(errors::IsDataLoss(status)); + ASSERT_EQ(0, offset); + + // A failed Read() shouldn't update the offset, and thus a retry shouldn't + // lose the record. + status = reader.ReadRecord(&offset, &read); + ASSERT_TRUE(status.ok()) << status; + EXPECT_GT(offset, 0); + EXPECT_EQ(wrote, read); +} + TEST_F(RecordioTest, ReadError) { - Write("foo"); - ForceError(); - AssertHasSubstr(Read(), "Data loss"); + TestReadError(RecordWriterOptions(), RecordReaderOptions()); +} + +TEST_F(RecordioTest, ReadErrorWithBuffering) { + RecordReaderOptions options; + options.buffer_size = 1 << 20; + TestReadError(RecordWriterOptions(), options); +} + +TEST_F(RecordioTest, ReadErrorWithCompression) { + TestReadError(RecordWriterOptions::CreateRecordWriterOptions("ZLIB"), + RecordReaderOptions::CreateRecordReaderOptions("ZLIB")); } TEST_F(RecordioTest, CorruptLength) { @@ -257,5 +340,6 @@ TEST_F(RecordioTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); } TEST_F(RecordioTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); } +} // namespace } // namespace io } // namespace tensorflow diff --git a/tensorflow/core/lib/io/zlib_inputstream.cc b/tensorflow/core/lib/io/zlib_inputstream.cc index 984fbc2810c..bf8dcf0988c 100644 --- a/tensorflow/core/lib/io/zlib_inputstream.cc +++ b/tensorflow/core/lib/io/zlib_inputstream.cc @@ -25,8 +25,9 @@ ZlibInputStream::ZlibInputStream( InputStreamInterface* input_stream, size_t input_buffer_bytes, // size of z_stream.next_in buffer size_t output_buffer_bytes, // size of z_stream.next_out buffer - const ZlibCompressionOptions& zlib_options) - : input_stream_(input_stream), + const ZlibCompressionOptions& zlib_options, bool owns_input_stream) + : owns_input_stream_(owns_input_stream), + input_stream_(input_stream), input_buffer_capacity_(input_buffer_bytes), output_buffer_capacity_(output_buffer_bytes), z_stream_input_(new Bytef[input_buffer_capacity_]), @@ -41,10 +42,14 @@ ZlibInputStream::~ZlibInputStream() { if (z_stream_) { 
inflateEnd(z_stream_.get()); } + if (owns_input_stream_) { + delete input_stream_; + } } Status ZlibInputStream::Reset() { TF_RETURN_IF_ERROR(input_stream_->Reset()); + inflateEnd(z_stream_.get()); InitZlibBuffer(); bytes_read_ = 0; return Status::OK(); diff --git a/tensorflow/core/lib/io/zlib_inputstream.h b/tensorflow/core/lib/io/zlib_inputstream.h index 9c7e14441ce..6099e2455d4 100644 --- a/tensorflow/core/lib/io/zlib_inputstream.h +++ b/tensorflow/core/lib/io/zlib_inputstream.h @@ -40,10 +40,13 @@ class ZlibInputStream : public InputStreamInterface { // Create a ZlibInputStream for `input_stream` with a buffer of size // `input_buffer_bytes` bytes for reading contents from `input_stream` and // another buffer with size `output_buffer_bytes` for caching decompressed - // contents. Does *not* take ownership of "input_stream". + // contents. + // + // Takes ownership of `input_stream` iff `owns_input_stream` is true. ZlibInputStream(InputStreamInterface* input_stream, size_t input_buffer_bytes, size_t output_buffer_bytes, - const ZlibCompressionOptions& zlib_options); + const ZlibCompressionOptions& zlib_options, + bool owns_input_stream = false); ~ZlibInputStream(); @@ -65,7 +68,8 @@ class ZlibInputStream : public InputStreamInterface { private: void InitZlibBuffer(); - InputStreamInterface* input_stream_; // Not owned + const bool owns_input_stream_; + InputStreamInterface* input_stream_; size_t input_buffer_capacity_; // Size of z_stream_input_ size_t output_buffer_capacity_; // Size of z_stream_output_ char* next_unread_byte_; // Next unread byte in z_stream_output_ From a4945fc86cabcf3d5f0b9eaac21bb7c1d1146d57 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Thu, 19 Apr 2018 14:30:27 -0700 Subject: [PATCH 0457/1734] The HLO element type converter must remove side effecting instructions like Rng The CPU backend does not know how to lower bf16 typed RNG nodes so even unused instances of these can't remain in the HLO IR. HloComputation::ReplaceInstruction keeps these Rng nodes around since it doesn't remove side effecting nodes. 
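In outline, the fix replaces the single ReplaceInstruction call with a manual
sequence (a condensed sketch of the code in the diff below, not a verbatim
excerpt):

    // `hlo` is the instruction being retired, `new_hlo` its replacement.
    TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo));  // keep control edges
    TF_RETURN_IF_ERROR(hlo->ReplaceAllUsesWith(new_hlo));      // rewire data users
    TF_RETURN_IF_ERROR(hlo->DropAllControlDeps());             // detach the old node
    TF_RETURN_IF_ERROR(computation->RemoveInstruction(hlo));   // removes Rng too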
PiperOrigin-RevId: 193575183
---
 .../xla/service/hlo_element_type_converter.cc |  15 ++++-
 .../hlo_element_type_converter_test.cc        |  66 +++++++++++++++++++
 .../compiler/xla/service/hlo_instruction.cc   |  37 ++++++++---
 .../compiler/xla/service/hlo_instruction.h    |  28 +++++---
 tensorflow/compiler/xla/util.h                |  10 +++
 5 files changed, 139 insertions(+), 17 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
index c782d1b0add..d236f83aeb9 100644
--- a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
@@ -178,24 +178,37 @@ StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
     if (hlo->shape().element_type() == eliminate_type_) {
       Shape shape =
           ShapeUtil::ChangeElementType(hlo->shape(), replace_with_type_);
+
       new_hlo = computation->AddInstruction(
           hlo->CloneWithNewOperands(shape, new_operands, hlo->GetModule()));
+      TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo));
+
       new_hlo = ToElementType(new_hlo, eliminate_type_);
     } else if (ShapeUtil::IsTuple(hlo->shape())) {
       Shape old_shape = hlo->shape();
       Shape new_shape = GetConvertedTupleShape(hlo->shape(), eliminate_type_,
                                                replace_with_type_);
+
       new_hlo = computation->AddInstruction(hlo->CloneWithNewOperands(
           new_shape, new_operands, hlo->GetModule()));
+      TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo));
+
       // Convert the elements of the result of `new_hlo` to produce a new
       // tuple with shape `old_shape`.
       new_hlo = ConvertTupleElements(new_hlo, old_shape);
     } else {
       new_hlo = computation->AddInstruction(hlo->CloneWithNewOperands(
           hlo->shape(), new_operands, hlo->GetModule()));
+      TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo));
     }
-    TF_RETURN_IF_ERROR(computation->ReplaceInstruction(hlo, new_hlo));
+    TF_RETURN_IF_ERROR(hlo->ReplaceAllUsesWith(new_hlo));
+    TF_RETURN_IF_ERROR(hlo->DropAllControlDeps());
+
+    // NB! We want to replace and remove side-effecting instructions like Rng
+    // as well, so we can't rely on HloComputation::ReplaceInstruction to
+    // reliably remove the replaced instruction.
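+    // Replacing all uses first and then calling RemoveInstruction explicitly
+    // sidesteps that restriction while leaving the computation consistent.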
+ TF_RETURN_IF_ERROR(computation->RemoveInstruction(hlo)); changed = true; } } diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc index cb94d9f19b8..5c5a059e0fd 100644 --- a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc +++ b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc @@ -22,6 +22,12 @@ namespace { namespace op = xla::testing::opcode_matchers; +using ::testing::Contains; +using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::Not; +using ::testing::ResultOf; + class HloElementTypeConverterTest : public HloTestBase { public: std::unique_ptr CreateModuleFromHloString( @@ -117,5 +123,65 @@ TEST_F(HloElementTypeConverterTest, BatchNormGradBF16Converted) { op::Convert(op::GetTupleElement(batch_norm, 2)))); } +TEST_F(HloElementTypeConverterTest, RngIsRemoved) { + const string& hlo_string = R"( +HloModule RngIsRemoved + +ENTRY main { + constant.3 = bf16[] constant(0) + constant.4 = bf16[] constant(1) + ROOT rng = bf16[1,1000,20]{2,1,0} rng(constant.3, constant.4), distribution=rng_uniform +} + )"; + auto module = CreateModuleFromHloString(hlo_string); + HloElementTypeConverter type_converter(BF16, F32); + TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get())); + EXPECT_TRUE(converted); + + std::function is_bf16_rng = + [](const HloInstruction* inst) { + return inst->shape().element_type() == BF16 && + inst->opcode() == HloOpcode::kRng; + }; + + EXPECT_THAT(module->entry_computation()->instructions(), + Not(Contains(ResultOf(is_bf16_rng, Eq(true))))); +} + +TEST_F(HloElementTypeConverterTest, RngCtrlDep) { + const string& hlo_string = R"( +HloModule RngIsRemoved + +ENTRY main { + constant.3 = bf16[] constant(0) + constant.4 = bf16[] constant(1) + rng0 = bf16[1,2000,20]{2,1,0} rng(constant.3, constant.4), distribution=rng_uniform + ROOT rng1 = bf16[1,1000,20]{2,1,0} rng(constant.3, constant.4), control-predecessors={%rng0}, distribution=rng_uniform +} + )"; + auto module = CreateModuleFromHloString(hlo_string); + + HloElementTypeConverter type_converter(BF16, F32); + TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get())); + EXPECT_TRUE(converted); + + HloInstruction *rng0, *rng1; + for (auto* inst : module->entry_computation()->instructions()) { + if (inst->opcode() == HloOpcode::kRng) { + const Shape& shape = inst->shape(); + ASSERT_EQ(shape.dimensions_size(), 3); + ASSERT_TRUE(shape.dimensions(1) == 2000 || shape.dimensions(1) == 1000); + if (shape.dimensions(1) == 2000) { + rng0 = inst; + } else { + rng1 = inst; + } + } + } + + EXPECT_THAT(rng0->control_successors(), ElementsAre(rng1)); + EXPECT_THAT(rng1->control_predecessors(), ElementsAre(rng0)); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 6303bcc59f3..a638d54d852 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -1678,14 +1678,35 @@ Status HloInstruction::AddControlDependencyTo(HloInstruction* instruction) { } Status HloInstruction::RemoveControlDependencyTo(HloInstruction* instruction) { - auto succ_it = std::find(control_successors_.begin(), - control_successors_.end(), instruction); - TF_RET_CHECK(succ_it != control_successors_.end()); - control_successors_.erase(succ_it); - auto pred_it = std::find(instruction->control_predecessors_.begin(), - 
instruction->control_predecessors_.end(), this); - TF_RET_CHECK(pred_it != instruction->control_predecessors_.end()); - instruction->control_predecessors_.erase(pred_it); + TF_RET_CHECK(instruction->parent() == parent()); + TF_RETURN_IF_ERROR(EraseElementFromVector(&control_successors_, instruction)); + TF_RETURN_IF_ERROR( + EraseElementFromVector(&instruction->control_predecessors_, this)); + return Status::OK(); +} + +Status HloInstruction::DropAllControlDeps() { + for (auto* ctrl_succ : control_successors_) { + TF_RETURN_IF_ERROR( + EraseElementFromVector(&ctrl_succ->control_predecessors_, this)); + } + for (auto* ctrl_pred : control_predecessors_) { + TF_RETURN_IF_ERROR( + EraseElementFromVector(&ctrl_pred->control_successors_, this)); + } + control_successors_.clear(); + control_predecessors_.clear(); + return Status::OK(); +} + +Status HloInstruction::CopyAllControlDepsFrom(const HloInstruction* inst) { + for (auto* ctrl_pred : inst->control_predecessors()) { + TF_RETURN_IF_ERROR(ctrl_pred->AddControlDependencyTo(this)); + } + + for (auto* ctrl_succ : inst->control_successors()) { + TF_RETURN_IF_ERROR(this->AddControlDependencyTo(ctrl_succ)); + } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 5a7394f7a65..a5e9aecb9e7 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -557,6 +557,18 @@ class HloInstruction { // 'instruction'. Status RemoveControlDependencyTo(HloInstruction* instruction); + // Drops all control predecessors and successors from this HLO instruction. + Status DropAllControlDeps(); + + // Copies the control predecessors and successors on this HLO instruction to + // `inst`. Does not do a deep copy so this makes sense only if `inst` and + // this HLO are in the same module. + // + // Depending on the use cases we see in practice, in the future we may + // consider folding the logic here into Clone, CloneWithNewOperands and + // ReplaceAllUsesWith by treating control dependencies like data dependencies. + Status CopyAllControlDepsFrom(const HloInstruction* inst); + // Returns the set of control predecessors (successors) of this // instruction. Control predecessors (successors) must execute before (after) // the current instruction. @@ -1148,17 +1160,17 @@ class HloInstruction { // Clones the HLO instruction. The clone will have the same opcode, shape, and // operands. After creation the clone has no uses. "this" (the instruction // cloned from) is not changed. Suffix is the string to append to the name of - // the instruction to form the name of the cloned instruction. - // If the module pointer is not nullptr, it will be the module where - // the cloned computations will be added to (in order to support deep - // cloning). + // the instruction to form the name of the cloned instruction. If the module + // pointer is not nullptr, it will be the module where the cloned computations + // will be added to (in order to support deep cloning). Ignores the control + // predecessors and successors of this HLO instruction. std::unique_ptr Clone(const string& suffix = "clone", HloModule* module = nullptr) const; - // Clones the HLO instruction as above but with new shape and operands. - // If the module pointer is not nullptr, it will be the module where - // the cloned computations will be added to (in order to support deep - // cloning). + // Clones the HLO instruction as above but with new shape and operands. 
If + // the module pointer is not nullptr, it will be the module where the cloned + // computations will be added to (in order to support deep cloning). Ignores + // the control predecessors and successors of this HLO instruction. std::unique_ptr CloneWithNewOperands( const Shape& shape, tensorflow::gtl::ArraySlice operands, HloModule* module = nullptr) const; diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h index 2da9f9ed6f4..be33bd6dd13 100644 --- a/tensorflow/compiler/xla/util.h +++ b/tensorflow/compiler/xla/util.h @@ -528,6 +528,16 @@ bool IsInt32(T x) { // value is implementation-defined." return static_cast(x) == x; } + +template +Status EraseElementFromVector(std::vector* container, const T& value) { + // c_find returns a const_iterator which does not seem to work on gcc 4.8.4, + // and this breaks the ubuntu/xla_gpu build bot. + auto it = std::find(container->begin(), container->end(), value); + TF_RET_CHECK(it != container->end()); + container->erase(it); + return Status::OK(); +} } // namespace xla #define XLA_LOG_LINES(SEV, STRING) \ From 1aa032b94f630845abf6c3dce8d6623ae9e35b0f Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 19 Apr 2018 14:35:27 -0700 Subject: [PATCH 0458/1734] Replaced calls to deprecated tensorflow::StringPiece methods with their tensorflow::str_util equivalents. This will allow the deprecated methods to be removed. PiperOrigin-RevId: 193575992 --- tensorflow/core/platform/test_main.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/platform/test_main.cc b/tensorflow/core/platform/test_main.cc index 677114f5f22..e57bbd80af4 100644 --- a/tensorflow/core/platform/test_main.cc +++ b/tensorflow/core/platform/test_main.cc @@ -26,7 +26,7 @@ limitations under the License. #include -#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/stacktrace_handler.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" @@ -37,7 +37,7 @@ GTEST_API_ int main(int argc, char** argv) { tensorflow::testing::InstallStacktraceHandler(); testing::InitGoogleTest(&argc, argv); for (int i = 1; i < argc; i++) { - if (tensorflow::StringPiece(argv[i]).starts_with("--benchmarks=")) { + if (tensorflow::str_util::StartsWith(argv[i], "--benchmarks=")) { const char* pattern = argv[i] + strlen("--benchmarks="); tensorflow::testing::Benchmark::Run(pattern); return 0; From 470842748b9ee219fa0fcb8e3de25720960c83e3 Mon Sep 17 00:00:00 2001 From: Olivia Nordquist Date: Thu, 19 Apr 2018 14:59:25 -0700 Subject: [PATCH 0459/1734] disabling opensource testing for failing xla test PiperOrigin-RevId: 193579805 --- tensorflow/compiler/xla/python/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index 0517a5502e6..0b9333b406d 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -8,6 +8,7 @@ py_library( name = "xla_client", srcs = ["xla_client.py"], srcs_version = "PY2AND3", + tags = ["no_oss"], visibility = ["//visibility:public"], deps = [ ":pywrap_xla", From 2d0a7087a14f015ea49f4b8feb70e0b5ecd41b28 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 19 Apr 2018 15:09:58 -0700 Subject: [PATCH 0460/1734] Only generate floating points that are fractions like n / 256, since they are RGB pixels. This fixes RGBToHSVTest.testBatch on low-precision dtypes like bfloat16. 
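Why n / 256 is safe (a minimal sketch, not from the patch): bfloat16 keeps an
8-bit significand, and n / 256 with 0 <= n < 256 equals n * 2^-8 with an
integer significand of at most 8 bits, so float-to-bfloat16 conversion is
exact for these values. A self-contained check, modeling bfloat16 as
truncation of a float's low 16 bits:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Model bfloat16 as keeping only the top 16 bits of a float's encoding.
    static float TruncateToBfloat16(float f) {
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof(bits));
      bits &= 0xFFFF0000u;  // sign + 8 exponent bits + top 7 mantissa bits
      std::memcpy(&f, &bits, sizeof(f));
      return f;
    }

    int main() {
      for (int n = 0; n < 256; ++n) {
        const float v = n / 256.0f;
        // Never fires: every n/256 is exactly representable in bfloat16.
        if (TruncateToBfloat16(v) != v) std::printf("inexact: %d\n", n);
      }
      return 0;
    }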
PiperOrigin-RevId: 193581652 --- tensorflow/compiler/tests/image_ops_test.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py index 5b19e993ece..42e637734c5 100644 --- a/tensorflow/compiler/tests/image_ops_test.py +++ b/tensorflow/compiler/tests/image_ops_test.py @@ -34,20 +34,23 @@ from tensorflow.python.ops import image_ops from tensorflow.python.platform import test +def GenerateNumpyRandomRGB(shape): + # Only generate floating points that are fractions like n / 256, since they + # are RGB pixels. Some low-precision floating point types in this test can't + # handle arbitrary precision floating points well. + return np.random.randint(0, 256, shape) / 256. + + class RGBToHSVTest(XLATestCase): def testBatch(self): - # TODO(b/78230407): Reenable the test on GPU. - if self.device == "XLA_GPU": - return - # Build an arbitrary RGB image np.random.seed(7) batch_size = 5 shape = (batch_size, 2, 7, 3) for nptype in self.float_types: - inp = np.random.rand(*shape).astype(nptype) + inp = GenerateNumpyRandomRGB(shape).astype(nptype) # Convert to HSV and back, as a batch and individually with self.test_session() as sess: @@ -87,7 +90,7 @@ class RGBToHSVTest(XLATestCase): def testRGBToHSVNumpy(self): """Tests the RGB to HSV conversion matches a reference implementation.""" for nptype in self.float_types: - rgb_flat = np.random.random(64 * 3).reshape((64, 3)).astype(nptype) + rgb_flat = GenerateNumpyRandomRGB((64, 3)).astype(nptype) rgb_np = rgb_flat.reshape(4, 4, 4, 3) hsv_np = np.array([ colorsys.rgb_to_hsv( From 38c0d7e1c0ee0617cf73ccf6809bd55d70089233 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 19 Apr 2018 15:27:19 -0700 Subject: [PATCH 0461/1734] Convert a local variable and mutex to a struct so GUARDED_BY annotation works correctly. PiperOrigin-RevId: 193584438 --- tensorflow/core/kernels/sdca_ops.cc | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/kernels/sdca_ops.cc b/tensorflow/core/kernels/sdca_ops.cc index 55e68b348b9..05c835ebc46 100644 --- a/tensorflow/core/kernels/sdca_ops.cc +++ b/tensorflow/core/kernels/sdca_ops.cc @@ -156,8 +156,10 @@ void DoCompute(const ComputeOptions& options, OpKernelContext* const context) { } else { examples.RandomShuffle(); } - mutex mu; - Status train_step_status GUARDED_BY(mu); + struct { + mutex mu; + Status value GUARDED_BY(mu); + } train_step_status; std::atomic atomic_index(-1); auto train_step = [&](const int64 begin, const int64 end) { // The static_cast here is safe since begin and end can be at most @@ -171,8 +173,8 @@ void DoCompute(const ComputeOptions& options, OpKernelContext* const context) { const Status conversion_status = options.loss_updater->ConvertLabel(&example_label); if (!conversion_status.ok()) { - mutex_lock l(mu); - train_step_status = conversion_status; + mutex_lock l(train_step_status.mu); + train_step_status.value = conversion_status; // Return from this worker thread - the calling thread is // responsible for checking context status and returning on error. 
return; @@ -217,7 +219,8 @@ void DoCompute(const ComputeOptions& options, OpKernelContext* const context) { Shard(worker_threads.num_threads, worker_threads.workers, examples.num_examples(), kCostPerUnit, train_step); - OP_REQUIRES_OK(context, train_step_status); + mutex_lock l(train_step_status.mu); + OP_REQUIRES_OK(context, train_step_status.value); } } // namespace From 4bcf49c4b22205fc829f89da96e37f366c9fa9e6 Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Thu, 19 Apr 2018 15:29:21 -0700 Subject: [PATCH 0462/1734] Prevent a bool field from being accessed when uninitialized. PiperOrigin-RevId: 193584746 --- tensorflow/core/distributed_runtime/message_wrappers.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/distributed_runtime/message_wrappers.h b/tensorflow/core/distributed_runtime/message_wrappers.h index 92c5668e3a1..72a0c7edd8e 100644 --- a/tensorflow/core/distributed_runtime/message_wrappers.h +++ b/tensorflow/core/distributed_runtime/message_wrappers.h @@ -353,7 +353,7 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper { private: string session_handle_; - bool create_worker_session_called_; + bool create_worker_session_called_ = false; string graph_handle_; int64 step_id_; ExecutorOpts exec_opts_; From 4868ddd508a567a497935378956e9da18976f152 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 19 Apr 2018 15:32:37 -0700 Subject: [PATCH 0463/1734] Simplifying cols_to_vars update PiperOrigin-RevId: 193585237 --- tensorflow/python/feature_column/feature_column.py | 6 ++---- tensorflow/python/feature_column/feature_column_test.py | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py index 87a52f84415..a7c4eabcb26 100644 --- a/tensorflow/python/feature_column/feature_column.py +++ b/tensorflow/python/feature_column/feature_column.py @@ -417,10 +417,8 @@ def linear_model(features, trainable=trainable, name='linear_model') retval = linear_model_layer(features) # pylint: disable=not-callable - if cols_to_vars is None: - return retval - for k, v in linear_model_layer.cols_to_vars().items(): - cols_to_vars[k] = v + if cols_to_vars is not None: + cols_to_vars.update(linear_model_layer.cols_to_vars()) return retval diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py index 49e06b82453..d963dd9b551 100644 --- a/tensorflow/python/feature_column/feature_column_test.py +++ b/tensorflow/python/feature_column/feature_column_test.py @@ -1269,10 +1269,8 @@ def get_keras_linear_model_predictions(features, trainable, name='linear_model') retval = keras_linear_model(features) # pylint: disable=not-callable - if cols_to_vars is None: - return retval - for k, v in keras_linear_model.cols_to_vars().items(): - cols_to_vars[k] = v + if cols_to_vars is not None: + cols_to_vars.update(keras_linear_model.cols_to_vars()) return retval From f500bcb889b3598f386f59eb69a79af6b704bf50 Mon Sep 17 00:00:00 2001 From: joel-shor Date: Fri, 20 Apr 2018 01:41:28 +0300 Subject: [PATCH 0464/1734] [tf.data] Allow `sample_from_datasets` to accept a tf.Dataset object for `weights`. 
Tested: bazel test :interleave_dataset_op_test --- .../interleave_dataset_op_test.py | 59 +++++++++++-------- .../contrib/data/python/ops/interleave_ops.py | 25 ++++---- 2 files changed, 45 insertions(+), 39 deletions(-) diff --git a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py index ff6d0c31aa8..43aa4b1bd02 100644 --- a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py @@ -928,8 +928,7 @@ class DirectedInterleaveDatasetTest(test.TestCase): sess.run(next_element) def _normalize(self, vec): - batched = (len(vec.shape) == 2) - return vec / vec.sum(axis=1, keepdims=True) if batched else vec / vec.sum() + return vec / vec.sum() def _chi2(self, expected, actual): actual = np.asarray(actual) @@ -938,35 +937,43 @@ class DirectedInterleaveDatasetTest(test.TestCase): chi2 = np.sum(diff * diff / expected, axis=0) return chi2 + def _testSampleFromDatasetsHelper(self, weights, num_datasets, num_samples): + # Create a dataset that samples each integer in `[0, num_datasets)` + # with probability given by `weights[i]`. + dataset = interleave_ops.sample_from_datasets([ + dataset_ops.Dataset.from_tensors(i).repeat(None) + for i in range(num_datasets) + ], weights) + dataset = dataset.take(num_samples) + iterator = dataset.make_one_shot_iterator() + next_element = iterator.get_next() + + with self.test_session() as sess: + freqs = np.zeros([num_datasets]) + for _ in range(num_samples): + freqs[sess.run(next_element)] += 1 + with self.assertRaises(errors.OutOfRangeError): + sess.run(next_element) + + return freqs + def testSampleFromDatasets(self): - random_seed.set_random_seed(1618) + random_seed.set_random_seed(1619) num_samples = 10000 - rand_probs = self._normalize(np.random.random_sample((10,))) - rand_probs2 = self._normalize(np.random.random_sample((15,))) + rand_probs = self._normalize(np.random.random_sample((15,))) - for probs in [[.5, .5], [.85, .05, .1], rand_probs, rand_probs2]: + # Use chi-squared test to assert that the observed distribution matches the + # expected distribution. Based on the implementation in + # "tensorflow/python/kernel_tests/multinomial_op_test.py". + for probs in [[.85, .05, .1], rand_probs]: probs = np.asarray(probs) + classes = len(probs) + freqs = self._testSampleFromDatasetsHelper(probs, classes, num_samples) + self.assertLess(self._chi2(probs, freqs / num_samples), 1e-3) - # Create a dataset that samples each integer in `[0, probs.shape[0])` - # with probability given by `probs[i]`. - dataset = interleave_ops.sample_from_datasets([ - dataset_ops.Dataset.from_tensors(i).repeat(None) - for i in range(probs.shape[0]) - ], probs) - dataset = dataset.take(num_samples) - iterator = dataset.make_one_shot_iterator() - next_element = iterator.get_next() - - with self.test_session() as sess: - freqs = np.zeros_like(probs) - for _ in range(num_samples): - freqs[sess.run(next_element)] += 1 - with self.assertRaises(errors.OutOfRangeError): - sess.run(next_element) - - # Use chi-squared test to assert that the observed distribution - # matches the expected distribution. Based on the implementation - # in "tensorflow/python/kernel_tests/multinomial_op_test.py". + # Also check that `weights` as a dataset samples correctly. 
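+      # When `weights` is a dataset, each sampling step draws its weight
+      # vector from it (see the zip with the random seed dataset in
+      # interleave_ops.py below), so weights may vary from draw to draw.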
+ probs_ds = dataset_ops.Dataset.from_tensors(probs).repeat() + freqs = self._testSampleFromDatasetsHelper(probs_ds, classes, num_samples) self.assertLess(self._chi2(probs, freqs / num_samples), 1e-3) def testErrors(self): diff --git a/tensorflow/contrib/data/python/ops/interleave_ops.py b/tensorflow/contrib/data/python/ops/interleave_ops.py index 106a1ef388a..5ae1fa9e9e1 100644 --- a/tensorflow/contrib/data/python/ops/interleave_ops.py +++ b/tensorflow/contrib/data/python/ops/interleave_ops.py @@ -200,10 +200,10 @@ def sample_from_datasets(datasets, weights=None, seed=None): Args: datasets: A list of @{tf.data.Dataset} objects with compatible structure. - weights: (Optional.) A list of `len(datasets)` floating-point values, - where `weights[i]` represents the probability with which an element - should be sampled from `datasets[i]`. Defaults to a uniform distribution - across `datasets`. + weights: (Optional.) A list of `len(datasets)` floating-point values or a + @{tf.data.Dataset} object, where `weights[i]` represents the probability + with which an element should be sampled from `datasets[i]`. Defaults to a + uniform distribution across `datasets`. seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the random seed that will be used to create the distribution. See @{tf.set_random_seed} for behavior. @@ -219,24 +219,23 @@ def sample_from_datasets(datasets, weights=None, seed=None): """ num_datasets = len(datasets) if weights is None: - weights = array_ops.ones( - [num_datasets], dtype=dtypes.float32, name="weights") - else: + weights = dataset_ops.Dataset.from_tensors([1.0] * num_datasets).repeat() + elif not isinstance(weights, dataset_ops.Dataset): weights = ops.convert_to_tensor(weights, name="weights") if weights.dtype not in (dtypes.float32, dtypes.float64): raise TypeError("`weights` must be convertible to a tensor of " "`tf.float32` or `tf.float64` elements.") if not weights.shape.is_compatible_with([num_datasets]): raise ValueError("`weights` must be a vector of length `len(datasets)`.") + weights = dataset_ops.Dataset.from_tensors(weights).repeat() # The `stateless_multinomial()` op expects log-probabilities, as opposed to # weights. - logits = math_ops.log(weights, name="logits") - - def select_dataset(seed): + logits_ds = weights.map(lambda *p: math_ops.log(p, name="logits")) + def select_dataset(logits, seed): return array_ops.squeeze( - stateless.stateless_multinomial([logits], 1, seed=seed), axis=[0, 1]) - - selector_input = random_ops.RandomDataset(seed).batch(2).map(select_dataset) + stateless.stateless_multinomial(logits, 1, seed=seed), axis=[0, 1]) + selector_input = dataset_ops.Dataset.zip( + (logits_ds, random_ops.RandomDataset(seed).batch(2))).map(select_dataset) return DirectedInterleaveDataset(selector_input, datasets) From d5c32f4ccc85ad0d13f3a1f83e063211504cf976 Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Thu, 19 Apr 2018 15:55:53 -0700 Subject: [PATCH 0465/1734] Internal-only change. 
PiperOrigin-RevId: 193588868 --- tensorflow/contrib/data/python/kernel_tests/BUILD | 1 + tensorflow/contrib/estimator/BUILD | 1 + tensorflow/contrib/learn/BUILD | 5 ++++- tensorflow/python/kernel_tests/BUILD | 3 +++ tensorflow/python/kernel_tests/linalg/BUILD | 5 ++++- 5 files changed, 13 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD index 83daa04efc9..05a4f5028ab 100644 --- a/tensorflow/contrib/data/python/kernel_tests/BUILD +++ b/tensorflow/contrib/data/python/kernel_tests/BUILD @@ -216,6 +216,7 @@ py_test( srcs_version = "PY2AND3", tags = [ "no_pip", + "noasan", # times out "optonly", ], deps = [ diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD index 9e88bc7de1a..62ddb3d290e 100644 --- a/tensorflow/contrib/estimator/BUILD +++ b/tensorflow/contrib/estimator/BUILD @@ -447,6 +447,7 @@ py_test( srcs_version = "PY2AND3", tags = [ "no_pip", + "noasan", # times out "notsan", ], deps = [ diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD index d665fc9335c..3b053cd4c66 100644 --- a/tensorflow/contrib/learn/BUILD +++ b/tensorflow/contrib/learn/BUILD @@ -281,7 +281,10 @@ py_test( size = "medium", srcs = ["python/learn/estimators/estimator_test.py"], srcs_version = "PY2AND3", - tags = ["manual"], + tags = [ + "manual", + "noasan", # times out + ], deps = [ ":learn", "//tensorflow/contrib/framework:framework_py", diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index 9440f2a4f99..8628ca5d401 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -1190,6 +1190,9 @@ cuda_py_test( "//tensorflow/python/eager:context", ], shard_count = 10, + tags = [ + "noasan", # times out + ], ) cuda_py_test( diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD index 4e3f24890b2..7ffa48b6530 100644 --- a/tensorflow/python/kernel_tests/linalg/BUILD +++ b/tensorflow/python/kernel_tests/linalg/BUILD @@ -123,7 +123,10 @@ cuda_py_test( "//tensorflow/python:platform_test", ], shard_count = 5, - tags = ["optonly"], + tags = [ + "noasan", # times out + "optonly", + ], ) cuda_py_test( From 9e5fdb83e609701457f6fdc2d153b1f7e83ead6c Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Thu, 19 Apr 2018 15:56:17 -0700 Subject: [PATCH 0466/1734] Automated g4 rollback of changelist 193564222 PiperOrigin-RevId: 193588935 --- tensorflow/contrib/image/kernels/image_ops.cc | 7 +-- tensorflow/contrib/image/kernels/image_ops.h | 2 +- tensorflow/contrib/image/ops/image_ops.cc | 52 ++----------------- .../python/kernel_tests/image_ops_test.py | 30 ----------- .../contrib/image/python/ops/image_ops.py | 39 ++++++-------- 5 files changed, 23 insertions(+), 107 deletions(-) diff --git a/tensorflow/contrib/image/kernels/image_ops.cc b/tensorflow/contrib/image/kernels/image_ops.cc index ae4b1ba62a8..c2e32da133b 100644 --- a/tensorflow/contrib/image/kernels/image_ops.cc +++ b/tensorflow/contrib/image/kernels/image_ops.cc @@ -70,7 +70,6 @@ class ImageProjectiveTransform : public OpKernel { void Compute(OpKernelContext* ctx) override { const Tensor& images_t = ctx->input(0); const Tensor& transform_t = ctx->input(1); - const Tensor& output_dim = ctx->input(2); OP_REQUIRES(ctx, images_t.shape().dims() == 4, errors::InvalidArgument("Input images must have rank 4")); OP_REQUIRES(ctx, @@ -84,11 +83,7 @@ class ImageProjectiveTransform : public OpKernel { auto images = images_t.tensor(); auto transform = transform_t.matrix(); Tensor* output_t; - // Image is NHWC format. - auto output_shape = images_t.shape(); - output_shape.set_dim(1, output_dim.vec()(0)); - output_shape.set_dim(2, output_dim.vec()(1)); - OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_t)); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, images_t.shape(), &output_t)); auto output = output_t->tensor(); (FillProjectiveTransform(interpolation_))( ctx->eigen_device(), &output, images, transform); diff --git a/tensorflow/contrib/image/kernels/image_ops.h b/tensorflow/contrib/image/kernels/image_ops.h index 2320329b923..ad501330617 100644 --- a/tensorflow/contrib/image/kernels/image_ops.h +++ b/tensorflow/contrib/image/kernels/image_ops.h @@ -161,7 +161,7 @@ struct FillProjectiveTransform { void operator()(const Device& device, OutputType* output, const InputType& images, const TransformsType& transform) const { - output->device(device) = output->generate( + output->device(device) = images.generate( ProjectiveGenerator(images, transform, interpolation_)); } }; diff --git a/tensorflow/contrib/image/ops/image_ops.cc b/tensorflow/contrib/image/ops/image_ops.cc index 4c6d8c0d192..68771b3d054 100644 --- a/tensorflow/contrib/image/ops/image_ops.cc +++ b/tensorflow/contrib/image/ops/image_ops.cc @@ -19,55 +19,9 @@ limitations under the License. namespace tensorflow { -using shape_inference::DimensionHandle; using shape_inference::InferenceContext; using shape_inference::ShapeHandle; -namespace { - -// Sets output[0] to shape [batch_dim,height,width,channel_dim], where -// height and width come from the size_tensor. -Status SetOutputToSizedImage(InferenceContext* c, DimensionHandle batch_dim, - int size_input_idx, DimensionHandle channel_dim) { - // Verify shape of size input. - ShapeHandle size; - TF_RETURN_IF_ERROR(c->WithRank(c->input(size_input_idx), 1, &size)); - DimensionHandle unused; - TF_RETURN_IF_ERROR(c->WithValue(c->Dim(size, 0), 2, &unused)); - - // Get size values from the size tensor. - const Tensor* size_tensor = c->input_tensor(size_input_idx); - DimensionHandle width; - DimensionHandle height; - if (size_tensor == nullptr) { - width = c->UnknownDim(); - height = c->UnknownDim(); - } else { - // TODO(petewarden) - Remove once we have constant evaluation in C++ only. 
- if (size_tensor->dtype() != DT_INT32) { - return errors::InvalidArgument( - "Bad size input type for SetOutputToSizedImage: Expected DT_INT32 " - "but got ", - DataTypeString(size_tensor->dtype()), " for input #", size_input_idx, - " in ", c->DebugString()); - } - auto vec = size_tensor->vec(); - height = c->MakeDim(vec(0)); - width = c->MakeDim(vec(1)); - } - c->set_output(0, c->MakeShape({batch_dim, height, width, channel_dim})); - return Status::OK(); -} - -Status ResizeShapeFn(InferenceContext* c) { - ShapeHandle input; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input)); - return SetOutputToSizedImage(c, c->Dim(input, 0), 2 /* size_input_idx */, - c->Dim(input, 3)); -} - -} // namespace - // TODO(ringwalt): Add a "fill_mode" argument with "constant", "mirror", etc. // TODO(ringwalt): Add a "fill_constant" argument for constant mode (default 0). // TODO(ringwalt): Add an "output_shape" argument. This is sufficient to @@ -75,11 +29,13 @@ Status ResizeShapeFn(InferenceContext* c) { REGISTER_OP("ImageProjectiveTransform") .Input("images: dtype") .Input("transforms: float32") - .Input("output_shape: int32") .Attr("dtype: {uint8, int32, int64, float32, float64}") .Attr("interpolation: string") .Output("transformed_images: dtype") - .SetShapeFn(ResizeShapeFn) + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->input(0)); + return Status::OK(); + }) .Doc(R"doc( Applies the given transform to each of the images. diff --git a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py index c0151d320f9..b50177ae565 100644 --- a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py +++ b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py @@ -195,40 +195,10 @@ class ImageOpsTest(test_util.TensorFlowTestCase): x_init_value=test_image) self.assertLess(left_err, 1e-10) - def _test_grad_different_shape(self, input_shape, output_shape): - with self.test_session(): - test_image_shape = input_shape - test_image = np.random.randn(*test_image_shape) - test_image_tensor = constant_op.constant( - test_image, shape=test_image_shape) - test_transform = image_ops.angles_to_projective_transforms( - np.pi / 2, 4, 4) - - if len(output_shape) == 2: - resize_shape = output_shape - elif len(output_shape) == 3: - resize_shape = output_shape[0:2] - elif len(output_shape) == 4: - resize_shape = output_shape[1:3] - output = image_ops.transform( - images=test_image_tensor, - transforms=test_transform, - output_shape=resize_shape) - left_err = gradient_checker.compute_gradient_error( - test_image_tensor, - test_image_shape, - output, - output_shape, - x_init_value=test_image) - self.assertLess(left_err, 1e-10) - def test_grad(self): self._test_grad([16, 16]) self._test_grad([4, 12, 12]) self._test_grad([3, 4, 12, 12]) - self._test_grad_different_shape([16, 16], [8, 8]) - self._test_grad_different_shape([4, 12, 3], [8, 24, 3]) - self._test_grad_different_shape([3, 4, 12, 3], [3, 8, 24, 3]) class BipartiteMatchTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py index 0cb7bdc75dd..c139ae89d8d 100644 --- a/tensorflow/contrib/image/python/ops/image_ops.py +++ b/tensorflow/contrib/image/python/ops/image_ops.py @@ -212,11 +212,7 @@ def translations_to_projective_transforms(translations, name=None): axis=1) -def transform(images, - transforms, - output_shape=None, - interpolation="NEAREST", - name=None): +def transform(images, 
transforms, interpolation="NEAREST", name=None): """Applies the given transform(s) to the image(s). Args: @@ -232,10 +228,7 @@ def transform(images, where `k = c0 x + c1 y + 1`. The transforms are *inverted* compared to the transform mapping input points to output points. Note that gradients are not backpropagated into transformation parameters. - output_shape: Output dimesion after the transform, [height, width]. - If None, output is the same size as input image. interpolation: Interpolation mode. Supported values: "NEAREST", "BILINEAR". - name: The name of the op. Returns: Image(s) with the same type and shape as `images`, with the given @@ -262,14 +255,6 @@ def transform(images, else: raise TypeError("Images should have rank between 2 and 4.") - if output_shape is None: - output_shape = images.get_shape()[1:3] - elif len(output_shape) != 2: - raise TypeError( - "output_shape must either be None or a vector of 2 elements.") - output_shape = ops.convert_to_tensor( - output_shape, name="output_shape", dtype=dtypes.int32) - if len(transform_or_transforms.get_shape()) == 1: transforms = transform_or_transforms[None] elif transform_or_transforms.get_shape().ndims is None: @@ -280,7 +265,7 @@ def transform(images, else: raise TypeError("Transforms should have rank 1 or 2.") output = gen_image_ops.image_projective_transform( - images, transforms, output_shape, interpolation=interpolation.upper()) + images, transforms, interpolation=interpolation.upper()) if len(image_or_images.get_shape()) == 2: return output[0, :, :, 0] elif len(image_or_images.get_shape()) == 3: @@ -390,6 +375,14 @@ def _image_projective_transform_grad(op, grad): if image_or_images.dtype.base_dtype not in _IMAGE_DTYPES: raise TypeError("Invalid dtype %s." % image_or_images.dtype) + if len(image_or_images.get_shape()) == 2: + images = image_or_images[None, :, :, None] + elif len(image_or_images.get_shape()) == 3: + images = image_or_images[None, :, :, :] + elif len(image_or_images.get_shape()) == 4: + images = image_or_images + else: + raise TypeError("Images should have rank between 2 and 4") if len(transform_or_transforms.get_shape()) == 1: transforms = transform_or_transforms[None] elif len(transform_or_transforms.get_shape()) == 2: @@ -402,11 +395,13 @@ def _image_projective_transform_grad(op, grad): inverse = linalg_ops.matrix_inverse(transforms) transforms = matrices_to_flat_transforms(inverse) output = gen_image_ops.image_projective_transform( - images=grad, - transforms=transforms, - output_shape=image_or_images.get_shape()[1:3], - interpolation=interpolation) - return [output, None, None] + grad, transforms, interpolation=interpolation) + if len(image_or_images.get_shape()) == 2: + return [output[0, :, :, 0], None] + elif len(image_or_images.get_shape()) == 3: + return [output[0, :, :, :], None] + else: + return [output, None] def bipartite_match(distance_mat, From c3f5d8c53295d9740c622f5221464c23559747ad Mon Sep 17 00:00:00 2001 From: Yifei Feng <1192265+yifeif@users.noreply.github.com> Date: Thu, 19 Apr 2018 16:02:09 -0700 Subject: [PATCH 0467/1734] Update install_python3.5_pip_packages.sh --- .../tools/ci_build/install/install_python3.5_pip_packages.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh index aefc49f6048..204a82f647e 100755 --- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh +++ 
b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh @@ -39,6 +39,9 @@ if [[ -z $pip35_version ]]; then fi set -e +pip3.5 install --upgrade setuptools +pip3.5 install --upgrade pip + pip3.5 install --upgrade virtualenv # Install six. From d4402725d2f6d9a8c5273ab1474117a27dd455c9 Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Thu, 19 Apr 2018 16:30:02 -0700 Subject: [PATCH 0468/1734] Make xla/service:cpu_plugin depend on the StreamExecutor host platform. PiperOrigin-RevId: 193593761 --- tensorflow/compiler/xla/service/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 9009cbf845e..d5d09bd8a3a 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -699,6 +699,7 @@ cc_library( "//tensorflow/compiler/xla/service/cpu:cpu_compiler", "//tensorflow/compiler/xla/service/cpu:cpu_transfer_manager", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/stream_executor:stream_executor_impl", ], ) From 704ac94a8e362feb3710391787342fe36187b9ef Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 19 Apr 2018 16:30:26 -0700 Subject: [PATCH 0469/1734] Cleaned up the handling of merge nodes PiperOrigin-RevId: 193593810 --- .../core/grappler/costs/graph_properties.cc | 89 +++++++------------ 1 file changed, 32 insertions(+), 57 deletions(-) diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc index dd2d53dfdfb..a0125ce3426 100644 --- a/tensorflow/core/grappler/costs/graph_properties.cc +++ b/tensorflow/core/grappler/costs/graph_properties.cc @@ -670,6 +670,29 @@ class SymbolicShapeRefiner { return true; } + Status AddNode(const Node* node) { + // Create the inference context for this node. + std::vector input_shapes(node->num_inputs()); + std::vector>> + input_handle_shapes_and_types(node->num_inputs()); + std::vector input_tensors(node->num_inputs(), nullptr); + std::vector input_tensors_as_shapes; + + NodeContext& node_ctx = node_to_context_[node]; + TF_RETURN_IF_ERROR( + function_library_.LookUp(node->type_string(), &node_ctx.op_data)); + + node_ctx.inference_context.reset(new InferenceContext( + graph_def_version_, &node->def(), node->op_def(), input_shapes, + input_tensors, input_tensors_as_shapes, + std::move(input_handle_shapes_and_types))); + const Status s = node_ctx.inference_context->construction_status(); + if (!s.ok()) { + node_ctx.inference_context.reset(nullptr); + } + return s; + } + private: // Return the one ShapeHandle used to denote a fully unknown shape for a node // output. @@ -698,29 +721,6 @@ class SymbolicShapeRefiner { return dim; } - Status AddNode(const Node* node) { - // Create the inference context for this node. 
- std::vector input_shapes(node->num_inputs()); - std::vector>> - input_handle_shapes_and_types(node->num_inputs()); - std::vector input_tensors(node->num_inputs(), nullptr); - std::vector input_tensors_as_shapes; - - NodeContext& node_ctx = node_to_context_[node]; - TF_RETURN_IF_ERROR( - function_library_.LookUp(node->type_string(), &node_ctx.op_data)); - - node_ctx.inference_context.reset(new InferenceContext( - graph_def_version_, &node->def(), node->op_def(), input_shapes, - input_tensors, input_tensors_as_shapes, - std::move(input_handle_shapes_and_types))); - const Status s = node_ctx.inference_context->construction_status(); - if (!s.ok()) { - node_ctx.inference_context.reset(nullptr); - } - return s; - } - struct NodeContext { const OpRegistrationData* op_data; std::unique_ptr inference_context; @@ -929,37 +929,16 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner, bool* new_shapes) const { InferenceContext* c = shape_refiner->GetContext(node); if (!c) { - // The shape refiner can't handle loops. Therefore we first need to remove - // all edges - std::vector edges; - std::vector edge_ptrs; - for (const Edge* edge : node->in_edges()) { - if (!edge->IsControlEdge()) { - edges.push_back(*edge); - edge_ptrs.push_back(edge); - } - } - for (const Edge* edge : edge_ptrs) { - if (!edge->IsControlEdge()) { - graph_->RemoveEdge(edge); - } - } // Now we can run shape inference - TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(node, relax, new_shapes)); - // And add all the edges back - for (const Edge& edge : edges) { - graph_->AddEdge(edge.src(), edge.src_output(), edge.dst(), - edge.dst_input()); - } - - c = shape_refiner->GetContext(node); + TF_RETURN_IF_ERROR(shape_refiner->AddNode(node)); + c = CHECK_NOTNULL(shape_refiner->GetContext(node)); *new_shapes = true; - CHECK_NE(c, nullptr); - } - ShapeHandle out1; - TF_RETURN_IF_ERROR(c->WithRank(c->output(1), 0, &out1)); - c->set_output(1, out1); + // Infer the shape of the second output once and for all since it never + // changes. + ShapeHandle out1 = c->Scalar(); + c->set_output(1, out1); + } ShapeHandle out; bool out_initialized = false; @@ -981,11 +960,7 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner, continue; } ShapeHandle input = in->output(e->src_output()); - if (relax) { - c->RelaxInput(e->dst_input(), input); - } else { - c->MergeInput(e->dst_input(), input); - } + c->SetInput(e->dst_input(), input); if (!out_initialized) { out_initialized = true; out = input; @@ -998,7 +973,7 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner, } } - if (!shape_refiner->EquivalentShapes(out, c->output(0))) { + if (*new_shapes || !shape_refiner->EquivalentShapes(out, c->output(0))) { c->set_output(0, out); *new_shapes = true; } From c93a883fcea141dc0f63fe63afcd9490e39e3eaf Mon Sep 17 00:00:00 2001 From: Mark Heffernan Date: Thu, 19 Apr 2018 16:35:40 -0700 Subject: [PATCH 0470/1734] Improve error messages for LiteralTestUtil::Near. Previously error messages for mismatches were difficult to read with much of the space taken by useless stack traces. This CL cleans up the message considerably and adds additional information including statistics about the values and mismatches. 
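The new comparator aggregates mismatches into cumulative error buckets instead
of EXPECTing on every element. The gist, as a simplified, self-contained
sketch (the illustrative bounds stand in for the real kErrorBucketBounds in
the diff below):

    #include <cstdio>
    #include <vector>

    // Bucket i counts how many errors reached at least bounds[i], yielding a
    // cumulative histogram for the failure message.
    void CountError(float error, const std::vector<float>& bounds,
                    std::vector<int>* buckets) {
      for (size_t i = 0; i < bounds.size(); ++i) {
        if (error >= bounds[i]) (*buckets)[i]++;
      }
    }

    int main() {
      std::vector<float> bounds = {1e-4f, 1e-3f, 1e-2f};
      std::vector<int> buckets(bounds.size(), 0);
      for (float e : {2e-4f, 5e-3f, 3e-2f}) CountError(e, bounds, &buckets);
      std::printf(">=1e-4: %d, >=1e-3: %d, >=1e-2: %d\n",
                  buckets[0], buckets[1], buckets[2]);  // prints 3, 2, 1
      return 0;
    }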
PiperOrigin-RevId: 193594593 --- .../compiler/xla/tests/literal_test_util.cc | 796 +++++++++++------- .../compiler/xla/tests/literal_test_util.h | 9 +- .../xla/tests/literal_test_util_test.cc | 2 +- 3 files changed, 485 insertions(+), 322 deletions(-) diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc index 81630df34c5..c28f79ae386 100644 --- a/tensorflow/compiler/xla/tests/literal_test_util.cc +++ b/tensorflow/compiler/xla/tests/literal_test_util.cc @@ -39,6 +39,11 @@ limitations under the License. namespace xla { +using ::tensorflow::strings::Appendf; +using ::tensorflow::strings::Printf; +using ::tensorflow::strings::StrAppend; +using ::tensorflow::strings::StrCat; + /* static */ ::testing::AssertionResult LiteralTestUtil::EqualShapes( const Shape& expected, const Shape& actual) { if (ShapeUtil::IsTuple(expected) != ShapeUtil::IsTuple(actual)) { @@ -173,14 +178,11 @@ template auto lhs_double = static_cast(lhs); auto rhs_double = static_cast(rhs); if (ulhs != urhs) { - return ::testing::AssertionFailure() << tensorflow::strings::Printf( + return ::testing::AssertionFailure() << Printf( "floating values are not bitwise-equal; and equality testing " "was requested: %s=%g=%a vs %s=%g=%a", - tensorflow::strings::StrCat(tensorflow::strings::Hex(ulhs)) - .c_str(), - lhs_double, lhs_double, - tensorflow::strings::StrCat(tensorflow::strings::Hex(urhs)) - .c_str(), + StrCat(tensorflow::strings::Hex(ulhs)).c_str(), lhs_double, + lhs_double, StrCat(tensorflow::strings::Hex(urhs)).c_str(), rhs_double, rhs_double); } return ::testing::AssertionSuccess(); @@ -264,9 +266,7 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual, << "expected:\n" << expected.ToString() << "\n\tvs actual:\n" << actual.ToString() - << (message.empty() - ? "" - : tensorflow::strings::StrCat("\nmessage: ", message)); + << (message.empty() ? "" : StrCat("\nmessage: ", message)); } /* static */ void LiteralTestUtil::ExpectNotEqual(const Literal& expected, @@ -321,9 +321,8 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual, case TUPLE: { bool tuple_match = true; for (int i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) { - SCOPED_TRACE(tensorflow::strings::StrCat( - "Tuple index ", i, " in ", - ShapeUtil::HumanString(expected.shape()))); + SCOPED_TRACE(StrCat("Tuple index ", i, " in ", + ShapeUtil::HumanString(expected.shape()))); // Create LiteralViews of the expected and actual elements. auto result = Equal(LiteralView::Create(expected, {i}), @@ -350,227 +349,301 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual, namespace { +// Gets the total element count. For tuples, this is not the count of tuple +// elements, but the sum of elements of each tuple element. +int64 RecursiveElementCount(const Shape& shape) { + if (ShapeUtil::IsTuple(shape)) { + const int64 tuple_elements = ShapeUtil::TupleElementCount(shape); + int64 total = 0; + for (int64 i = 0; i < tuple_elements; ++i) { + total += RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i)); + } + return total; + } else { + return ShapeUtil::ElementsIn(shape); + } +} + +// Calling ToString on a literal with over 100 million elements takes around +// 3 minutes. The utility of printing a literal with >1000 elements is +// questionable, especially when writing the Literal proto to disk is orders +// of magnitude faster. 
+string TruncateHugeLiteral(const Literal& literal) { + return RecursiveElementCount(literal.shape()) < 1000 + ? literal.ToString() + : "[TRUNCATED, Literal with more than 1000 values]"; +} + +// Returns whether the actual and expected values are mismatched with respect to +// nans. 'relaxed_nans' is interpreted as in xla::ErrorSpec. +template +bool NanMismatch(NativeT expected, NativeT actual, bool relaxed_nans) { + if (relaxed_nans) { + return !std::isnan(expected) && std::isnan(actual); + } else { + return std::isnan(expected) != std::isnan(actual); + } +} + +template <> +bool NanMismatch(complex64 expected, complex64 actual, + bool relaxed_nans) { + return NanMismatch(expected.real(), actual.real(), relaxed_nans) || + NanMismatch(expected.imag(), actual.imag(), relaxed_nans); +} + +template <> +bool NanMismatch(half expected, half actual, bool relaxed_nans) { + return NanMismatch(static_cast(expected), + static_cast(actual), relaxed_nans); +} + +// Converts the given floating-point value to a string. +template +string FpValueToString(NativeT value) { + return Printf("%8.4g", static_cast(value)); +} + +template <> +string FpValueToString(complex64 value) { + return Printf("%8.4g + %8.4fi", value.real(), value.imag()); +} + +// Returns the absolute value of the given floating point value. This function +// is used instead of std::abs directly in order to allow type-dependent +// implementations for NearComparator. +template +float FpAbsoluteValue(NativeT value) { + return std::abs(value); +} + +template <> +float FpAbsoluteValue(bfloat16 value) { + return FpAbsoluteValue(static_cast(value)); +} + +template <> +float FpAbsoluteValue(half value) { + return FpAbsoluteValue(static_cast(value)); +} + // Helper class for comparing floating-point literals within an error bound. +template class NearComparator { public: - explicit NearComparator(ErrorSpec error) : error_(error) {} - - // Compares the two literals elementwise. EXPECTs each pair of elements to be - // within the error bound. Emits useful log messages and dumps literals to - // temporary files on failure. Returns true if literals match. - bool ExpectNear(const Literal& expected, const Literal& actual) { - VLOG(1) << "expected:"; - XLA_VLOG_LINES(1, TruncateHugeLiteral(expected)); - VLOG(1) << "actual:"; - XLA_VLOG_LINES(1, TruncateHugeLiteral(actual)); - - // If the shapes mismatch, we simply fail the expectation instead of - // printing out data, as it's a type error rather than a value error. - ::testing::AssertionResult equal_shapes = - LiteralTestUtil::EqualShapes(expected.shape(), actual.shape()); - if (!equal_shapes) { - EXPECT_TRUE(equal_shapes); - return false; - } - - // Set up members used during the comparison. 
- num_miscompares_ = 0; - abs_diff_sum_ = 0.0; - abs_expected_sum_ = 0.0; - abs_diff_miscompare_sum_ = 0.0; - abs_expected_miscompare_sum_ = 0.0; - max_rel_err_ = 0.0; - max_abs_err_ = 0.0; - first_linear_index_ = -1; - last_linear_index_ = -1; - max_rel_linear_index_ = -1; - max_abs_linear_index_ = -1; - miscompares_ = Literal(ShapeUtil::ChangeElementType(actual.shape(), PRED)); - miscompares_.PopulateWithValue(false); - multi_index_.resize(expected.shape().dimensions_size(), 0); - - switch (expected.shape().element_type()) { - case BF16: - ExpectLiteralsNear(expected, actual, 0); - break; - case F16: - ExpectLiteralsNear(expected, actual, 0); - break; - case F32: - ExpectLiteralsNear(expected, actual, 0); - break; - case F64: - ExpectLiteralsNear(expected, actual, 0); - break; - case C64: - ExpectLiteralsNear(expected, actual, 0); - break; - default: - LOG(FATAL) << "Unsupported primitive type in near comparator: " - << PrimitiveType_Name(expected.shape().element_type()) - << ". Must be floating-point type."; - } - - if (num_miscompares_ > 0) { - if (!VLOG_IS_ON(1)) { - LOG(INFO) << "expected: " << ShapeUtil::HumanString(expected.shape()) - << " " << TruncateHugeLiteral(expected); - LOG(INFO) << "actual: " << ShapeUtil::HumanString(actual.shape()) - << " " << TruncateHugeLiteral(actual); - LOG(INFO) << "Dumping literals to temp files..."; - WriteLiteralToTempFile(expected, "expected"); - WriteLiteralToTempFile(actual, "actual"); - WriteLiteralToTempFile(miscompares_, "miscompares"); - } - EXPECT_TRUE(num_miscompares_ == 0) - << "\nmax relative mismatch at index " - << LiteralTestUtil::MultiIndexAsString( - IndexUtil::LinearIndexToMultidimensionalIndex( - actual.shape(), max_rel_linear_index_)) - << "\nmaximum relative error " << max_rel_err_ - << "\nmax absolute mismatch at index " - << LiteralTestUtil::MultiIndexAsString( - IndexUtil::LinearIndexToMultidimensionalIndex( - actual.shape(), max_abs_linear_index_)) - << "\nmaximum absolute error " << max_abs_err_ - << "\nfirst mismatch at index " - << LiteralTestUtil::MultiIndexAsString( - IndexUtil::LinearIndexToMultidimensionalIndex( - actual.shape(), first_linear_index_)) - << "\nlast mismatch at index " - << LiteralTestUtil::MultiIndexAsString( - IndexUtil::LinearIndexToMultidimensionalIndex( - actual.shape(), last_linear_index_)) - << "\ntotal absolute error " << abs_diff_sum_ - << "\ntotal absolute error of miscompares " - << abs_diff_miscompare_sum_ << "\ntotal relative error " - << (abs_diff_sum_ / abs_expected_sum_) - << "\ntotal relative error of miscompares " - << (abs_diff_miscompare_sum_ / abs_expected_miscompare_sum_) - << "\nfailure count " << num_miscompares_; - } - return num_miscompares_ == 0; + // Compares the two array literals elementwise and returns an assertion + // result. The assertion result is successful if all actual and expected + // elements are within the given error bound. In case of error, the assertion + // result contains a detailed error message in case of failure. 
+ static ::testing::AssertionResult Compare(const Literal& expected, + const Literal& actual, + ErrorSpec error, + bool detailed_message) { + NearComparator comparator(expected, actual, error, + detailed_message); + return comparator.Run(); } private: - template - bool NanMismatch(NativeT expected, NativeT actual, bool relaxed_nans) { - if (relaxed_nans) { - return !std::isnan(expected) && std::isnan(actual); - } else { - return std::isnan(expected) != std::isnan(actual); - } - } + // Data structure encapsulating metadata about a single element mismatch. + struct Mismatch { + NativeT actual; + NativeT expected; + float rel_error; + float abs_error; - template - void ExpectNear(NativeT expected, NativeT actual, - const ::testing::Message& message) { - EXPECT_NEAR(expected, actual, error_.abs) - << "expected:\n " << expected << "\n\tvs actual:\n " << actual << "\n" - << message; - } + // The linear index of the failure within the shape. This linear index is + // from the 'actual' literal. + int64 linear_index; - // EXPECTs that the two given scalar values are within the error bound. Keeps - // track of how many mismatches have occurred to keep the size of the output - // manageable. - template - bool ExpectValuesNear(NativeT expected, NativeT actual) { - if (expected == actual) { - return true; + bool operator<(const Mismatch& other) const { + return rel_error < other.rel_error; } - const float abs_diff = std::abs(actual - expected); - const float rel_err = abs_diff / std::abs(expected); - const bool nan_mismatch = - NanMismatch(expected, actual, error_.relaxed_nans); - const bool mismatch = - (nan_mismatch || (abs_diff >= error_.abs && rel_err >= error_.rel)); - return !mismatch; - } - - // Assumes that expected vs actual fail ExpectValuesNear. - template - void UpdateAndLogMiscompares(const NativeT expected, const NativeT actual, - const Shape& shape, const int64 linear_index) { - const float abs_diff = std::abs(actual - expected); - const float rel_err = abs_diff / std::abs(expected); - abs_diff_sum_ += abs_diff; - abs_expected_sum_ += std::abs(expected); - if (rel_err > max_rel_err_ || std::isnan(rel_err)) { - max_rel_err_ = rel_err; - max_rel_linear_index_ = linear_index; - } - if (abs_diff > max_abs_err_ || std::isnan(abs_diff)) { - max_abs_err_ = abs_diff; - max_abs_linear_index_ = linear_index; - } - if (VLOG_IS_ON(10)) { - VLOG(10) << tensorflow::strings::Printf( - "index %s abs_diff %f rel_err %f", + string ToString(const Shape& shape) const { + return Printf( + "actual %s, expected %s, index %s, rel error %8.3g, abs error %8.3g", + FpValueToString(actual).c_str(), FpValueToString(expected).c_str(), LiteralTestUtil::MultiIndexAsString( IndexUtil::LinearIndexToMultidimensionalIndex(shape, linear_index)) .c_str(), - abs_diff, rel_err); + rel_error, abs_error); } - abs_diff_miscompare_sum_ += abs_diff; - abs_expected_miscompare_sum_ += std::abs(expected); - const int64 kMaxFailures = 2; - if (num_miscompares_ < kMaxFailures) { - const auto multi_index = - IndexUtil::LinearIndexToMultidimensionalIndex(shape, linear_index); - ::testing::Message msg; - msg << "mismatch at index " - << LiteralTestUtil::MultiIndexAsString(multi_index) << " abs diff " - << abs_diff << " rel err " << rel_err << " failure #" - << num_miscompares_; - ExpectNear(expected, actual, msg); - } else if (num_miscompares_ == kMaxFailures) { - LOG(ERROR) << "reached max 'loud' failure count; silently proceeding..."; + }; + + explicit NearComparator(const Literal& expected, const Literal& actual, + ErrorSpec error, bool 
detailed_message) + : expected_(expected), + actual_(actual), + error_(error), + detailed_message_(detailed_message), + abs_value_buckets_(kAbsValueBucketBounds.size() - 1, {0, 0}), + abs_error_buckets_(kErrorBucketBounds.size(), 0), + rel_error_buckets_(kErrorBucketBounds.size(), 0) {} + + // Runs the comparison between expected and actual literals. + ::testing::AssertionResult Run() { + VLOG(1) << "expected:"; + XLA_VLOG_LINES(1, TruncateHugeLiteral(expected_)); + VLOG(1) << "actual:"; + XLA_VLOG_LINES(1, TruncateHugeLiteral(actual_)); + + // If the shapes mismatch, we simply fail the expectation instead of + // printing out data, as it's a type error rather than a value error. + ::testing::AssertionResult equal_shapes = + LiteralTestUtil::EqualShapes(expected_.shape(), actual_.shape()); + if (!equal_shapes) { + return equal_shapes; } - if (num_miscompares_ == 0) { - first_linear_index_ = linear_index; + if (!ShapeUtil::IsArray(expected_.shape())) { + return ::testing::AssertionFailure() << "Expected array shape"; } - num_miscompares_++; - last_linear_index_ = linear_index; - miscompares_.data()[linear_index] = true; + + mismatches_ = Literal(ShapeUtil::ChangeElementType(actual_.shape(), PRED)); + mismatches_.PopulateWithValue(false); + + CompareLiterals(); + + if (num_mismatches_ == 0) { + return ::testing::AssertionSuccess(); + } else if (!VLOG_IS_ON(1)) { + LOG(INFO) << "expected: " << ShapeUtil::HumanString(expected_.shape()) + << " " << TruncateHugeLiteral(expected_); + LOG(INFO) << "actual: " << ShapeUtil::HumanString(actual_.shape()) + << " " << TruncateHugeLiteral(actual_); + LOG(INFO) << "Dumping literals to temp files..."; + WriteLiteralToTempFile(expected_, "expected"); + WriteLiteralToTempFile(actual_, "actual"); + WriteLiteralToTempFile(mismatches_, "mismatches"); + } + return ::testing::AssertionFailure() << ErrorMessage(); } - // Recursive function which compares the two given literals elementwise. - template - void ExpectLiteralsNear(const Literal& expected, const Literal& actual, - int64 dimension) { - // Fast path optimization for the case were layouts match. - if (LayoutUtil::Equal(actual.shape().layout(), expected.shape().layout())) { - tensorflow::gtl::ArraySlice expected_data = - expected.data(); - tensorflow::gtl::ArraySlice actual_data = - actual.data(); - const int64 len = expected_data.size(); - for (int64 i = 0; i < len; ++i) { - const bool near = ExpectValuesNear(expected_data[i], actual_data[i]); - if (!near) { - UpdateAndLogMiscompares(expected_data[i], actual_data[i], - actual.shape(), i); + // Insert the given absolute value into the absolute value bucket vector. The + // bounds of the buckets are given by kAbsValueBucketBounds. + void UpdateAbsValueBucket(NativeT value, bool is_mismatch) { + // Adjust the bucket containing the absolute values of the 'actual' + // elements. + const float abs_value = FpAbsoluteValue(value); + for (int i = 0; i < abs_value_buckets_.size(); ++i) { + if (i == abs_value_buckets_.size() - 1 || + (abs_value >= kAbsValueBucketBounds[i] && + abs_value < kAbsValueBucketBounds[i + 1])) { + // The first value of the pair is the count of elements in the bucket, + // the second is the count of mismatches in the bucket. + abs_value_buckets_[i].first++; + if (is_mismatch) { + abs_value_buckets_[i].second++; } + return; } + } + } + + // Insert the given error into the given error bucket vector. 
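The function defined next fills *cumulative* buckets, which is easy to misread on a first pass: bucket i counts every error that is at least kErrorBucketBounds[i], so a single error value typically increments several buckets. Here is the same rule in a standalone, compilable sketch; the bounds are the ones declared later in this class, everything else is illustrative:

#include <array>
#include <cstdint>
#include <vector>

// Cumulative error buckets: bucket i counts every error >= kErrorBucketBounds[i].
// An error of 0.003 therefore increments the 0.0001 and 0.001 buckets, but not
// the 0.01 bucket.
constexpr std::array<float, 5> kErrorBucketBounds = {0.0001f, 0.001f, 0.01f,
                                                     0.1f, 1.0f};

void UpdateErrorBucket(float error, std::vector<int64_t>* error_buckets) {
  for (size_t i = 0; i < error_buckets->size(); ++i) {
    if (error >= kErrorBucketBounds[i]) {
      (*error_buckets)[i]++;
    }
  }
}

Calling UpdateErrorBucket(0.003f, &buckets) on a zeroed five-element vector yields {1, 1, 0, 0, 0}. The real member function follows.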
+  void UpdateErrorBucket(
+      float error, tensorflow::gtl::MutableArraySlice<int64> error_buckets) {
+    CHECK_EQ(error_buckets.size(), kErrorBucketBounds.size());
+    for (int i = 0; i < error_buckets.size(); ++i) {
+      if (error >= kErrorBucketBounds[i]) {
+        error_buckets[i]++;
+      }
+    }
+  }
+
+  // Compares the two given elements from the expected and actual literals at
+  // the given literal_index and keeps track of various mismatch statistics.
+  void CompareValues(NativeT expected, NativeT actual, int64 linear_index) {
+    const bool is_nan_mismatch =
+        NanMismatch(expected, actual, error_.relaxed_nans);
+    float abs_error;
+    float rel_error;
+    if (actual == expected) {
+      abs_error = 0;
+      rel_error = 0;
+    } else if (is_nan_mismatch) {
+      num_nan_mismatches_++;
+      // A nan mismatch is considered to have infinite error. rel_error is
+      // used for sorting a std::set of the top mismatches, and a nan value
+      // here would result in undefined behavior because nans do not satisfy
+      // the strict weak ordering requirement of std containers.
+      abs_error = std::numeric_limits<float>::infinity();
+      rel_error = std::numeric_limits<float>::infinity();
+    } else {
+      abs_error = FpAbsoluteValue(actual - expected);
+      rel_error = abs_error / FpAbsoluteValue(expected);
+    }
+    const bool is_abs_mismatch = abs_error > error_.abs;
+    const bool is_rel_mismatch = rel_error > error_.rel;
+    const bool is_mismatch =
+        is_nan_mismatch || (is_abs_mismatch && is_rel_mismatch);
+
+    // Update the error of the relative bucket only if the *absolute* error
+    // bound is exceeded and vice versa.
+    if (is_abs_mismatch) {
+      num_abs_mismatches_++;
+      UpdateErrorBucket(rel_error, &rel_error_buckets_);
+    }
+    if (is_rel_mismatch) {
+      num_rel_mismatches_++;
+      UpdateErrorBucket(abs_error, &abs_error_buckets_);
+    }
+
+    UpdateAbsValueBucket(actual, is_mismatch);
+
+    if (!is_mismatch) {
       return;
     }
-    if (dimension == expected.shape().dimensions_size()) {
-      bool near = ExpectValuesNear(expected.Get<NativeT>(multi_index_),
-                                   actual.Get<NativeT>(multi_index_));
-      if (!near) {
-        UpdateAndLogMiscompares(
-            expected.Get<NativeT>(multi_index_),
-            actual.Get<NativeT>(multi_index_), actual.shape(),
-            IndexUtil::MultidimensionalIndexToLinearIndex(actual.shape(),
-                                                          multi_index_));
+    num_mismatches_++;
+
+    // Keep track of the kTopRelativeErrorCount largest relative-error
+    // mismatches.
+    if (top_rel_mismatches_.size() < kTopRelativeErrorCount ||
+        rel_error > top_rel_mismatches_.begin()->rel_error) {
+      Mismatch mismatch = {actual, expected, rel_error, abs_error,
+                           linear_index};
+      top_rel_mismatches_.insert(mismatch);
+      if (top_rel_mismatches_.size() > kTopRelativeErrorCount) {
+        top_rel_mismatches_.erase(top_rel_mismatches_.begin());
       }
+    }
+
+    mismatches_.data<bool>()[linear_index] = true;
+  }
+
+  // Compares the two literals elementwise.
+  void CompareLiterals() {
+    // Fast path optimization for the case where layouts match.
+    if (LayoutUtil::Equal(actual_.shape().layout(),
+                          expected_.shape().layout())) {
+      tensorflow::gtl::ArraySlice<NativeT> expected_data =
+          expected_.data<NativeT>();
+      tensorflow::gtl::ArraySlice<NativeT> actual_data =
+          actual_.data<NativeT>();
+      const int64 len = expected_data.size();
+      for (int64 i = 0; i < len; ++i) {
+        CompareValues(expected_data[i], actual_data[i], i);
+      }
+      return;
+    }
+    std::vector<int64> multi_index(ShapeUtil::Rank(actual_.shape()), 0);
+    CompareLiteralsSlow(0, &multi_index);
+  }
+
+  // Slow path for CompareLiterals when 'actual' and 'expected' literals have
+  // different layouts. In this case, multidimensional indices are constructed
+  // and indexed for each element.
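Stepping back from the traversal for a moment, the classification rule in CompareValues above deserves emphasis: an element only counts as a mismatch when it violates *both* the absolute and the relative bound, or when it is a NaN mismatch. A self-contained sketch of that predicate, assuming the strict (non-relaxed) NaN rule; the ErrorSpec struct here is a local stand-in, not the XLA type:

#include <cmath>
#include <limits>

struct ErrorSpec {
  float abs;  // absolute error bound
  float rel;  // relative error bound
};

// An element is a mismatch only if it exceeds both bounds at once (or is a
// NaN mismatch). NaN mismatches get infinite error so they sort above all
// finite mismatches.
bool IsMismatch(float expected, float actual, const ErrorSpec& spec,
                float* abs_error, float* rel_error) {
  const bool nan_mismatch = std::isnan(expected) != std::isnan(actual);
  if (actual == expected) {
    *abs_error = 0.0f;
    *rel_error = 0.0f;
  } else if (nan_mismatch) {
    *abs_error = std::numeric_limits<float>::infinity();
    *rel_error = std::numeric_limits<float>::infinity();
  } else {
    *abs_error = std::abs(actual - expected);
    *rel_error = *abs_error / std::abs(expected);
  }
  return nan_mismatch || (*abs_error > spec.abs && *rel_error > spec.rel);
}

For example, with bounds {abs: 0.01, rel: 0.01}, expected 1000.0 against actual 1000.5 gives abs_error 0.5 but rel_error 5e-4, so it passes; requiring both bounds to fail keeps large-magnitude values from being penalized for proportionally tiny noise. The slow-path traversal resumes below.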
+ void CompareLiteralsSlow(int64 dimension, std::vector* multi_index) { + if (dimension == multi_index->size()) { + CompareValues(expected_.Get(*multi_index), + actual_.Get(*multi_index), + IndexUtil::MultidimensionalIndexToLinearIndex( + actual_.shape(), *multi_index)); } else { - for (int64 i = 0; i < expected.shape().dimensions(dimension); ++i) { - multi_index_[dimension] = i; - ExpectLiteralsNear(expected, actual, dimension + 1); + for (int64 i = 0; i < expected_.shape().dimensions(dimension); ++i) { + (*multi_index)[dimension] = i; + CompareLiteralsSlow(dimension + 1, multi_index); } } } @@ -580,159 +653,247 @@ class NearComparator { int64 now_usec = tensorflow::Env::Default()->NowMicros(); string filename = tensorflow::io::JoinPath( tensorflow::testing::TmpDir(), - tensorflow::strings::Printf("tempfile-%s-%llx-%s", Hostname().c_str(), - now_usec, name.c_str())); + Printf("tempfile-%s-%llx-%s", Hostname().c_str(), now_usec, + name.c_str())); TF_CHECK_OK(tensorflow::WriteBinaryProto(tensorflow::Env::Default(), filename, literal.ToProto())); LOG(ERROR) << "wrote to " << name << " file: " << filename; } - // Gets the total element count. For tuples, this is not the count of tuple - // elements, but the sum of elements of each tuple element. - int64 RecursiveElementCount(const Shape& shape) { - if (ShapeUtil::IsTuple(shape)) { - const int64 tuple_elements = ShapeUtil::TupleElementCount(shape); - int64 total = 0; - for (int64 i = 0; i < tuple_elements; ++i) { - total += - RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i)); - } - return total; - } else { - return ShapeUtil::ElementsIn(shape); + // Returns an error message string with a detailed breakdown of the + // mismatches. Called after calling Run(). + string ErrorMessage() { + string out; + int64 element_count = ShapeUtil::ElementsIn(actual_.shape()); + + auto percent_string = [](float a, float b) { + float pct = b == 0.0 ? 0.0 : 100.0 * a / b; + return Printf("%0.4f%%", pct); + }; + + Appendf(&out, + "\nMismatch count %lld (%s) in shape %s (%lld elements), abs bound " + "%g, rel bound %g\n", + num_mismatches_, + percent_string(num_mismatches_, element_count).c_str(), + ShapeUtil::HumanString(actual_.shape()).c_str(), + ShapeUtil::ElementsIn(actual_.shape()), error_.abs, error_.rel); + if (num_nan_mismatches_ > 0) { + StrAppend(&out, "nan mismatches ", num_nan_mismatches_, "\n"); } + Appendf(&out, "Top relative error mismatches:\n"); + for (auto it = top_rel_mismatches_.rbegin(); + it != top_rel_mismatches_.rend(); ++it) { + StrAppend(&out, " ", it->ToString(actual_.shape()).c_str(), "\n"); + } + + if (!detailed_message_) { + return out; + } + + StrAppend(&out, "Absolute magnitude breakdown of actual values:\n"); + CHECK_EQ(abs_value_buckets_.size() + 1, kAbsValueBucketBounds.size()); + for (int i = 0; i < abs_value_buckets_.size(); ++i) { + const int64 bucket_size = abs_value_buckets_[i].first; + const int64 bucket_mismatches = abs_value_buckets_[i].second; + string mismatch_str = bucket_mismatches > 0 + ? 
Printf(", mismatches %lld", bucket_mismatches) + : ""; + Appendf(&out, " %-6g <= x < %-6g : %7lld (%9s)%s\n", + kAbsValueBucketBounds[i], kAbsValueBucketBounds[i + 1], + bucket_size, percent_string(bucket_size, element_count).c_str(), + mismatch_str.c_str()); + } + + auto print_accum_buckets = [&](const string& header, int64 total, + tensorflow::gtl::ArraySlice buckets) { + StrAppend(&out, header, ":\n"); + Appendf(&out, " < %-6g : %7lld (%s)\n", kErrorBucketBounds[0], + total - buckets[0], + percent_string(total - buckets[0], total).c_str()); + CHECK_EQ(buckets.size(), kErrorBucketBounds.size()); + for (int i = 0; i < kErrorBucketBounds.size(); ++i) { + Appendf(&out, " >= %-6g : %7lld (%s)\n", kErrorBucketBounds[i], + buckets[i], percent_string(buckets[i], total).c_str()); + } + }; + Appendf(&out, "Elements exceeding abs error bound %g: %lld (%s)\n", + error_.abs, num_abs_mismatches_, + percent_string(num_abs_mismatches_, element_count).c_str()); + print_accum_buckets( + "Relative error breakdown of elements exceeding abs error bound", + num_abs_mismatches_, rel_error_buckets_); + Appendf(&out, "Elements exceeding rel error bound %g: %lld (%s)\n", + error_.rel, num_rel_mismatches_, + percent_string(num_rel_mismatches_, element_count).c_str()); + print_accum_buckets( + "Absolute error breakdown of elements exceeding rel error bound", + num_rel_mismatches_, abs_error_buckets_); + return out; } - // Calling ToString on a literal with over 100 million elements takes around - // 3 minutes. The utility of printing a literal with >1000 elements is - // questionable, especially when writing the Literal proto to disk is orders - // of magnitude faster. - string TruncateHugeLiteral(const Literal& literal) { - return RecursiveElementCount(literal.shape()) < 1000 - ? literal.ToString() - : "[TRUNCATED, Literal with more than 1000 values]"; - } + // 'actual' and 'expected' literals being compared. + const Literal& expected_; + const Literal& actual_; + // The error bounds of the comparison. ErrorSpec error_; - // Number of element miscomparisons encountered so far. - int64 num_miscompares_; + // Whether to include detailed breakdown of mismatches in the error message. + bool detailed_message_; + + // Number of element element mismatches encountered so far. + int64 num_mismatches_ = 0; + + // Number of elements with a nan mismatch. + int64 num_nan_mismatches_ = 0; + + // Number of elements which exceed the absolute/relative error bound. + int64 num_abs_mismatches_ = 0; + int64 num_rel_mismatches_ = 0; // A Literal containing which elements did not match in the expected and - // actual literals. miscompares_ contains PREDs and is of the same sizes as + // actual literals. mismatches_ contains PREDs and is of the same sizes as // the comparison literals. - Literal miscompares_; + Literal mismatches_; - // A multidimensional index used when performing the recursive comparison. - std::vector multi_index_; + // The number of mismatches to report in the output, sorted by relative error + // magnitude. + static constexpr int64 kTopRelativeErrorCount = 5; - // Aggregated Statistics on input. - double abs_diff_sum_; - double abs_expected_sum_; - double abs_diff_miscompare_sum_; - double abs_expected_miscompare_sum_; - float max_rel_err_; - float max_abs_err_; - int64 first_linear_index_; - int64 last_linear_index_; - int64 max_rel_linear_index_; - int64 max_abs_linear_index_; + // The set of mismatches with the largest relative error. The size of this set + // is bounded by kTopRelativeErrorCount. 
+  std::multiset<Mismatch> top_rel_mismatches_;
+
+  // Actual values are bucketed by absolute value. kAbsValueBucketBounds is the
+  // bounds of these buckets. abs_value_buckets_ contains a pair for each
+  // bucket: the element count and failure count.
+  static constexpr std::array<float, 7> kAbsValueBucketBounds = {
+      0.0, 0.0001, 0.001, 0.01, 0.1, 1,
+      std::numeric_limits<float>::infinity()};
+  std::vector<std::pair<int64, int64>> abs_value_buckets_;
+
+  // Buckets for relative and absolute errors. The relative error buckets only
+  // contain those elements which exceed the *absolute* error bound, and vice
+  // versa. This makes it easy to see the effect of adjusting the relative (or
+  // absolute) error bound on the success of the comparison. kErrorBucketBounds
+  // are the lower bounds of the buckets in both vectors. The error buckets are
+  // a cumulative distribution, so an error value may appear in more than one
+  // bucket. For example, an error value of 0.003 is counted in the buckets
+  // bounded by 0.0001 and 0.001.
+  static constexpr std::array<float, 5> kErrorBucketBounds = {0.0001, 0.001,
+                                                              0.01, 0.1, 1};
+  std::vector<int64> abs_error_buckets_;
+  std::vector<int64> rel_error_buckets_;
 };

-template <>
-bool NearComparator<complex64>::NanMismatch(complex64 expected,
-                                            complex64 actual,
-                                            bool relaxed_nans) {
-  return NanMismatch(expected.real(), actual.real(), relaxed_nans) ||
-         NanMismatch(expected.imag(), actual.imag(), relaxed_nans);
-}
+template <typename NativeT>
+constexpr std::array<float, 7> NearComparator<NativeT>::kAbsValueBucketBounds;
+template <typename NativeT>
+constexpr std::array<float, 5> NearComparator<NativeT>::kErrorBucketBounds;

-template <>
-void NearComparator<complex64>::ExpectNear(complex64 expected, complex64 actual,
-                                           const ::testing::Message& message) {
-  EXPECT_NEAR(expected.real(), actual.real(), error_.abs)
-      << "expected:\n  " << expected << "\n\tvs actual:\n  " << actual << "\n"
-      << message;
-  EXPECT_NEAR(expected.imag(), actual.imag(), error_.abs)
-      << "expected:\n  " << expected << "\n\tvs actual:\n  " << actual << "\n"
-      << message;
-}
-
-template <>
-bool NearComparator<bfloat16>::ExpectValuesNear(bfloat16 expected,
-                                                bfloat16 actual) {
-  return ExpectValuesNear(static_cast<float>(expected),
-                          static_cast<float>(actual));
-}
-
-template <>
-bool NearComparator<half>::ExpectValuesNear(half expected, half actual) {
-  return ExpectValuesNear(static_cast<float>(std::move(expected)),
-                          static_cast<float>(std::move(actual)));
-}
-
-template <>
-void NearComparator<bfloat16>::UpdateAndLogMiscompares(
-    const bfloat16 expected, const bfloat16 actual, const Shape& shape,
-    const int64 linear_index) {
-  UpdateAndLogMiscompares(static_cast<float>(expected),
-                          static_cast<float>(actual), shape, linear_index);
-}
-
-template <>
-void NearComparator<half>::UpdateAndLogMiscompares(half expected, half actual,
-                                                   const Shape& shape,
-                                                   const int64 linear_index) {
-  UpdateAndLogMiscompares(static_cast<float>(std::move(expected)),
-                          static_cast<float>(std::move(actual)), shape,
-                          linear_index);
-}
-
-}  // namespace
-
-/* static */ ::testing::AssertionResult LiteralTestUtil::Near(
-    const Literal& expected, const Literal& actual, const ErrorSpec& error) {
+// Helper function for comparing two literals for nearness. Handles tuple-shapes
+// via recursion. shape_index is the ShapeIndex of expected (or actual)
+// currently being compared.
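The recursion this comment describes has the following shape. This is a toy model, not the real implementation: Node and leaf_ok stand in for Literal and the element-wise comparison, and the index path plays the role of ShapeIndex:

#include <string>
#include <vector>

// Toy model of the tuple recursion: each tuple level visits its children with
// an extended index path, and child failures are folded into one message.
struct Node {
  bool is_tuple = false;
  std::vector<Node> children;
  bool leaf_ok = true;  // stands in for the leaf-level comparison result
};

bool CompareTree(const Node& node, std::vector<int>* index_path,
                 std::string* message) {
  if (node.is_tuple) {
    bool ok = true;
    for (int i = 0; i < static_cast<int>(node.children.size()); ++i) {
      index_path->push_back(i);
      ok = CompareTree(node.children[i], index_path, message) && ok;
      index_path->pop_back();
    }
    return ok;
  }
  if (!node.leaf_ok) {
    message->append("\nArray at shape index {");
    for (int d : *index_path) message->append(std::to_string(d) + ",");
    message->append("} mismatched");
  }
  return node.leaf_ok;
}

The real NearHelper below follows this shape, with the extra wrinkle that the top-level call prepends a summary of the overall shape when any element fails.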
+::testing::AssertionResult NearHelper(const Literal& expected, + const Literal& actual, + const ErrorSpec& error, + bool detailed_message, + const ShapeIndex& shape_index) { ::testing::AssertionResult err = - EqualShapes(expected.shape(), actual.shape()); + LiteralTestUtil::EqualShapes(expected.shape(), actual.shape()); if (!err) { return err; } if (ShapeUtil::IsTuple(expected.shape())) { for (int64 i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) { - SCOPED_TRACE(tensorflow::strings::StrCat( - "Tuple index ", i, " in ", ShapeUtil::HumanString(expected.shape()))); const auto expected_element = LiteralView::Create(expected, {i}); const auto actual_element = LiteralView::Create(actual, {i}); - + ShapeIndex element_index = shape_index; + element_index.push_back(i); ::testing::AssertionResult res = - Near(expected_element, actual_element, error); - if (err && !res) { - err = res; + NearHelper(expected_element, actual_element, error, detailed_message, + element_index); + if (!res) { + string err_message = + Printf("\nArray at shape index %s%s", + element_index.ToString().c_str(), res.message()); + if (err) { + err = ::testing::AssertionFailure() << err_message; + } else { + err << err_message; + } } } + if (!err && shape_index.empty()) { + // Emit a top-level error message containing the top-level shape in case + // of mismatch. + int64 total_elements = RecursiveElementCount(actual.shape()); + err = ::testing::AssertionFailure() + << Printf("\nMismatches in shape %s (%lld elements):\n%s", + ShapeUtil::HumanString(actual.shape()).c_str(), + total_elements, err.message()); + } return err; } if (ShapeUtil::ElementIsFloating(expected.shape()) || ShapeUtil::ElementIsComplex(expected.shape())) { - NearComparator comparator(error); - return comparator.ExpectNear(expected, actual) - ? ::testing::AssertionSuccess() - : ::testing::AssertionFailure() << "values were not near"; + switch (expected.shape().element_type()) { + case BF16: + return NearComparator::Compare(expected, actual, error, + detailed_message); + break; + case F16: + return NearComparator::Compare(expected, actual, error, + detailed_message); + break; + case F32: + return NearComparator::Compare(expected, actual, error, + detailed_message); + break; + case F64: + return NearComparator::Compare(expected, actual, error, + detailed_message); + break; + case C64: + return NearComparator::Compare(expected, actual, error, + detailed_message); + break; + default: + LOG(FATAL) << "Unsupported primitive type in near comparator: " + << PrimitiveType_Name(expected.shape().element_type()) + << ". Must be floating-point type."; + } } - return Equal(expected, actual); + // Non-floating point literal. + return LiteralTestUtil::Equal(expected, actual); +} + +} // namespace + +/* static */ ::testing::AssertionResult LiteralTestUtil::Near( + const Literal& expected, const Literal& actual, const ErrorSpec& error, + bool detailed_message) { + return NearHelper(expected, actual, error, detailed_message, + /*shape_index=*/{}); } /* static */ void LiteralTestUtil::ExpectNear(const Literal& expected, const Literal& actual, const ErrorSpec& error, const string& message) { - EXPECT_TRUE(Near(expected, actual, error)) - << (message.empty() - ? 
"" - : tensorflow::strings::StrCat("\nmessage: ", message)); + ::testing::AssertionResult res = + Near(expected, actual, error, /*detailed_message=*/false); + if (!res) { + res << "Expected: " << TruncateHugeLiteral(expected) << "\n"; + res << "Actual: " << TruncateHugeLiteral(actual) << "\n"; + if (!message.empty()) { + res << StrCat("\nmessage: ", message); + } + } + EXPECT_TRUE(res); } /*static*/ ::testing::AssertionResult LiteralTestUtil::NearOrEqual( @@ -754,8 +915,7 @@ void NearComparator::UpdateAndLogMiscompares(half expected, half actual, /* static */ string LiteralTestUtil::MultiIndexAsString( tensorflow::gtl::ArraySlice multi_index) { - return tensorflow::strings::StrCat( - "{", tensorflow::str_util::Join(multi_index, ","), "}"); + return StrCat("{", tensorflow::str_util::Join(multi_index, ","), "}"); } /* static */ std::unique_ptr LiteralTestUtil::Reshape( diff --git a/tensorflow/compiler/xla/tests/literal_test_util.h b/tensorflow/compiler/xla/tests/literal_test_util.h index 7b757a4bd7e..a755568c0f0 100644 --- a/tensorflow/compiler/xla/tests/literal_test_util.h +++ b/tensorflow/compiler/xla/tests/literal_test_util.h @@ -122,16 +122,19 @@ class LiteralTestUtil { // bounds are equivalent. // // Tuples are matched recursively. When comparing tensors of - // non-floating-point type, checks for exact equality, ignoring the ErroSpec. + // non-floating-point type, checks for exact equality, ignoring the ErrorSpec. // // If the shape of the literals is neither a complex/floating-point tensor nor // a tuple which contains a complex/floating-point tensor, Near() is // equivalent to Equal(). We don't raise an error in this case, because we // want to allow callers to call Near() even if they have no preconceptions // about the shapes being compared. + // + // If detailed_message is true, then the error message in the assertion result + // will contain a more detailed breakdown of mismatches. static ::testing::AssertionResult Near( - const Literal& expected, const Literal& actual, - const ErrorSpec& error) TF_MUST_USE_RESULT; + const Literal& expected, const Literal& actual, const ErrorSpec& error, + bool detailed_message = false) TF_MUST_USE_RESULT; // Expects expected and actual to be Near with the given error. static void ExpectNear(const Literal& expected, const Literal& actual, diff --git a/tensorflow/compiler/xla/tests/literal_test_util_test.cc b/tensorflow/compiler/xla/tests/literal_test_util_test.cc index 3a421f84582..9d619a77c7e 100644 --- a/tensorflow/compiler/xla/tests/literal_test_util_test.cc +++ b/tensorflow/compiler/xla/tests/literal_test_util_test.cc @@ -89,7 +89,7 @@ TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) { EXPECT_EQ("2", literal->ToString()); } else if (result.find("actual") != string::npos) { EXPECT_EQ("4", literal->ToString()); - } else if (result.find("miscompares") != string::npos) { + } else if (result.find("mismatches") != string::npos) { EXPECT_EQ("true", literal->ToString()); } else { FAIL() << "unknown file in temporary directory: " << result; From 35543d5777b87c18b47eb73e83af41240a022e26 Mon Sep 17 00:00:00 2001 From: joel-shor Date: Fri, 20 Apr 2018 02:49:58 +0300 Subject: [PATCH 0471/1734] [tf.data] Correct / clarify docstring for `weights` as a dataset. This is a noop. 
--- tensorflow/contrib/data/python/ops/interleave_ops.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tensorflow/contrib/data/python/ops/interleave_ops.py b/tensorflow/contrib/data/python/ops/interleave_ops.py index 5ae1fa9e9e1..812a50ecbf1 100644 --- a/tensorflow/contrib/data/python/ops/interleave_ops.py +++ b/tensorflow/contrib/data/python/ops/interleave_ops.py @@ -200,10 +200,11 @@ def sample_from_datasets(datasets, weights=None, seed=None): Args: datasets: A list of @{tf.data.Dataset} objects with compatible structure. - weights: (Optional.) A list of `len(datasets)` floating-point values or a - @{tf.data.Dataset} object, where `weights[i]` represents the probability - with which an element should be sampled from `datasets[i]`. Defaults to a - uniform distribution across `datasets`. + weights: (Optional.) A list of `len(datasets)` floating-point values where + `weights[i]` represents the probability with which an element should be + sampled from `datasets[i]`, or a @{tf.data.Dataset} object where each + element is such a list. Defaults to a uniform distribution across + `datasets`. seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the random seed that will be used to create the distribution. See @{tf.set_random_seed} for behavior. From e07c9e23a94866966aa7e336a519b55931d570e3 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 19 Apr 2018 16:53:14 -0700 Subject: [PATCH 0472/1734] Run EvaluateNodes for ModelPruner test except for NoPruning. PiperOrigin-RevId: 193596812 --- tensorflow/core/grappler/optimizers/BUILD | 1 + .../grappler/optimizers/model_pruner_test.cc | 52 +++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index 63492e1a7f2..a371186fe64 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -365,6 +365,7 @@ tf_cuda_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", + "//tensorflow/core/grappler:devices", "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler:utils", "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder", diff --git a/tensorflow/core/grappler/optimizers/model_pruner_test.cc b/tensorflow/core/grappler/optimizers/model_pruner_test.cc index 2b12eadec96..cf5b990377f 100644 --- a/tensorflow/core/grappler/optimizers/model_pruner_test.cc +++ b/tensorflow/core/grappler/optimizers/model_pruner_test.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/grappler/devices.h" #include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h" #include "tensorflow/core/grappler/utils.h" @@ -133,6 +134,13 @@ TEST_F(ModelPrunerTest, IdentityPruning) { EXPECT_EQ(NodeName(b.name()), new_d.input(0)); EXPECT_EQ(1, new_c.input_size()); EXPECT_EQ(NodeName(b.name()), new_c.input(0)); + + std::vector fetch = {"e"}; + auto expected_tensors = EvaluateNodes(item.graph, fetch); + auto actual_tensors = EvaluateNodes(output, fetch); + EXPECT_EQ(1, expected_tensors.size()); + EXPECT_EQ(1, actual_tensors.size()); + test::ExpectTensorEqual(expected_tensors[0], actual_tensors[0]); } TEST_F(ModelPrunerTest, NoOpPruning) { @@ -171,6 +179,13 @@ TEST_F(ModelPrunerTest, NoOpPruning) { EXPECT_EQ("a", new_node.input(0)); } } + + std::vector fetch = {"e"}; + auto expected_tensors = EvaluateNodes(item.graph, fetch); + auto actual_tensors = EvaluateNodes(output, fetch); + EXPECT_EQ(1, expected_tensors.size()); + EXPECT_EQ(1, actual_tensors.size()); + test::ExpectTensorEqual(expected_tensors[0], actual_tensors[0]); } TEST_F(ModelPrunerTest, PreserveIdentities) { @@ -201,6 +216,19 @@ TEST_F(ModelPrunerTest, PreserveIdentities) { TF_EXPECT_OK(status); EXPECT_EQ(item.graph.node_size(), output.node_size()); + + auto v_in_t = GenerateRandomTensor(TensorShape({3})); + Tensor v_ctrl_t(DT_BOOL, TensorShape({})); + v_ctrl_t.flat()(0) = true; + auto expected_tensors = EvaluateNodes( + item.graph, {"merge", "id2"}, {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}}); + auto actual_tensors = EvaluateNodes(output, {"merge", "id2"}, + {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}}); + EXPECT_EQ(2, expected_tensors.size()); + EXPECT_EQ(2, actual_tensors.size()); + for (int i = 0; i < expected_tensors.size(); i++) { + test::ExpectTensorEqual(expected_tensors[i], actual_tensors[i]); + } } TEST_F(ModelPrunerTest, PruningSkipsRefOutputs) { @@ -241,6 +269,14 @@ TEST_F(ModelPrunerTest, PruningSkipsRefOutputs) { EXPECT_EQ("b", new_c.input(0)); EXPECT_EQ("b", new_d.input(0)); EXPECT_EQ("b", new_e.input(0)); + + std::vector fetch = {"e"}; + auto a_t = GenerateRandomTensor(TensorShape({})); + auto expected_tensors = EvaluateNodes(item.graph, fetch, {{"a", a_t}}); + auto actual_tensors = EvaluateNodes(output, fetch, {{"a", a_t}}); + EXPECT_EQ(1, expected_tensors.size()); + EXPECT_EQ(1, actual_tensors.size()); + test::ExpectTensorEqual(expected_tensors[0], actual_tensors[0]); } // TODO(rmlarsen): Reenable this test when the issues with @@ -316,6 +352,12 @@ TEST_F(ModelPrunerTest, PruningPerservesFetch) { EXPECT_EQ(NodeName(b.name()), new_b.name()); const NodeDef& new_c = output.node(2); EXPECT_EQ(NodeName(c.name()), new_c.name()); + + auto expected_tensors = EvaluateNodes(item.graph, item.fetch); + auto actual_tensors = EvaluateNodes(output, item.fetch); + EXPECT_EQ(1, expected_tensors.size()); + EXPECT_EQ(1, actual_tensors.size()); + test::ExpectTensorEqual(expected_tensors[0], actual_tensors[0]); } TEST_F(ModelPrunerTest, PruningPerservesCrossDeviceIdentity) { @@ -348,6 +390,16 @@ TEST_F(ModelPrunerTest, PruningPerservesCrossDeviceIdentity) { EXPECT_EQ("c", node.input(0)); } } + if (GetNumAvailableGPUs() > 0) { + auto expected_tensors = EvaluateNodes(item.graph, item.fetch); + auto actual_tensors = EvaluateNodes(output, item.fetch); + EXPECT_EQ(4, expected_tensors.size()); + 
EXPECT_EQ(4, actual_tensors.size()); + for (int i = 0; i < expected_tensors.size(); i++) { + test::ExpectTensorNear(expected_tensors[i], actual_tensors[i], + 1e-6); + } + } } } // namespace From 2d8da1d12a5fbeaa99e1cdd761b735a02020611b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 19 Apr 2018 17:17:05 -0700 Subject: [PATCH 0473/1734] Removed deprecated methods from tensorflow::StringPiece. This will allow tensorflow::StringPiece to be more easily replaced with absl::string_view as absl::string_view does not contain those methods. PiperOrigin-RevId: 193599651 --- tensorflow/core/lib/core/stringpiece.cc | 4 --- tensorflow/core/lib/core/stringpiece.h | 26 -------------------- tensorflow/core/lib/core/stringpiece_test.cc | 10 -------- 3 files changed, 40 deletions(-) diff --git a/tensorflow/core/lib/core/stringpiece.cc b/tensorflow/core/lib/core/stringpiece.cc index 0b006fa2b46..4c488066e4b 100644 --- a/tensorflow/core/lib/core/stringpiece.cc +++ b/tensorflow/core/lib/core/stringpiece.cc @@ -25,10 +25,6 @@ std::ostream& operator<<(std::ostream& o, StringPiece piece) { return o; } -bool StringPiece::contains(StringPiece s) const { - return std::search(begin(), end(), s.begin(), s.end()) != end(); -} - size_t StringPiece::find(char c, size_t pos) const { if (pos >= size_) { return npos; diff --git a/tensorflow/core/lib/core/stringpiece.h b/tensorflow/core/lib/core/stringpiece.h index 835b938cbfd..0cf6c248509 100644 --- a/tensorflow/core/lib/core/stringpiece.h +++ b/tensorflow/core/lib/core/stringpiece.h @@ -88,20 +88,6 @@ class StringPiece { size_t find(char c, size_t pos = 0) const; size_t rfind(char c, size_t pos = npos) const; - // DEPRECATED: Use tensorflow::str_util::StrContains instead. - bool contains(StringPiece s) const; - - // Checks whether StringPiece starts with x and if so advances the beginning - // of it to past the match. It's basically a shortcut for starts_with - // followed by remove_prefix. - // DEPRECATED: Use tensorflow::str_util::ConsumePrefix instead. - bool Consume(StringPiece x) { - if (starts_with(x)) { - remove_prefix(x.size_); - return true; - } - return false; - } StringPiece substr(size_t pos, size_t n = npos) const; @@ -114,18 +100,6 @@ class StringPiece { // > 0 iff "*this" > "b" int compare(StringPiece b) const; - // Return true iff "x" is a prefix of "*this" - // DEPRECATED: Use tensorflow::str_util::StartsWith instead. - bool starts_with(StringPiece x) const { - return ((size_ >= x.size_) && (memcmp(data_, x.data_, x.size_) == 0)); - } - // Return true iff "x" is a suffix of "*this" - // DEPRECATED: Use tensorflow::str_util::EndsWith instead. 
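The replacements named in these deprecation notes are one-liners over the underlying bytes. A self-contained sketch of the equivalent logic on std::string_view, mirroring (not quoting) the tensorflow::str_util helpers the comments point to:

#include <string_view>

bool StartsWith(std::string_view text, std::string_view prefix) {
  return text.size() >= prefix.size() &&
         text.compare(0, prefix.size(), prefix) == 0;
}

bool EndsWith(std::string_view text, std::string_view suffix) {
  return text.size() >= suffix.size() &&
         text.compare(text.size() - suffix.size(), suffix.size(), suffix) == 0;
}

bool StrContains(std::string_view haystack, std::string_view needle) {
  return haystack.find(needle) != std::string_view::npos;
}

// Shortcut for StartsWith followed by remove_prefix, like the deprecated
// StringPiece::Consume.
bool ConsumePrefix(std::string_view* text, std::string_view prefix) {
  if (!StartsWith(*text, prefix)) return false;
  text->remove_prefix(prefix.size());
  return true;
}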
- bool ends_with(StringPiece x) const { - return ((size_ >= x.size_) && - (memcmp(data_ + (size_ - x.size_), x.data_, x.size_) == 0)); - } - private: const char* data_; size_t size_; diff --git a/tensorflow/core/lib/core/stringpiece_test.cc b/tensorflow/core/lib/core/stringpiece_test.cc index d0dbeb6072c..de35d6eac6e 100644 --- a/tensorflow/core/lib/core/stringpiece_test.cc +++ b/tensorflow/core/lib/core/stringpiece_test.cc @@ -55,14 +55,4 @@ TEST(StringPiece, Ctor) { } } -TEST(StringPiece, Contains) { - StringPiece a("abcdefg"); - StringPiece b("abcd"); - StringPiece c("efg"); - StringPiece d("gh"); - EXPECT_TRUE(a.contains(b)); - EXPECT_TRUE(a.contains(c)); - EXPECT_TRUE(!a.contains(d)); -} - } // namespace tensorflow From 4e17a3f1496b398afe632b002b0589b7346b2e3f Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Thu, 19 Apr 2018 17:18:10 -0700 Subject: [PATCH 0474/1734] [XLA] De-unique_ptr-ify ShapedBuffer and ScopedShapedBuffer. These are already notionally equivalent to T* and unique_ptr, so having a unique_ptr of a {Scoped,}ShapedBuffer is pretty redundant. Also clean up the ScopedShapedBuffer API a bit. PiperOrigin-RevId: 193599773 --- tensorflow/compiler/jit/xla_launch_util.cc | 47 ++--- tensorflow/compiler/jit/xla_launch_util.h | 2 +- tensorflow/compiler/jit/xla_tensor.cc | 6 +- tensorflow/compiler/jit/xla_tensor.h | 6 +- .../compiler/xla/client/local_client.cc | 23 ++- tensorflow/compiler/xla/client/local_client.h | 6 +- .../xla/python/local_computation_builder.cc | 46 ++--- .../xla/python/local_computation_builder.h | 6 +- .../xla/service/allocation_tracker.cc | 33 ++-- .../compiler/xla/service/allocation_tracker.h | 14 +- .../xla/service/cpu/cpu_executable.cc | 14 +- .../compiler/xla/service/cpu/cpu_executable.h | 8 +- .../service/cpu/parallel_cpu_executable.cc | 9 +- .../xla/service/cpu/parallel_cpu_executable.h | 4 +- tensorflow/compiler/xla/service/executable.cc | 16 +- tensorflow/compiler/xla/service/executable.h | 8 +- .../xla/service/gpu/gpu_executable.cc | 10 +- .../compiler/xla/service/gpu/gpu_executable.h | 4 +- tensorflow/compiler/xla/service/hlo_runner.cc | 45 +++-- .../xla/service/interpreter/executable.cc | 9 +- .../xla/service/interpreter/executable.h | 4 +- tensorflow/compiler/xla/service/service.cc | 14 +- .../compiler/xla/service/shaped_buffer.cc | 36 ++-- .../compiler/xla/service/shaped_buffer.h | 64 ++++--- .../compiler/xla/service/transfer_manager.cc | 21 ++- .../compiler/xla/service/transfer_manager.h | 8 +- .../compiler/xla/tests/dynamic_ops_test.cc | 8 +- tensorflow/compiler/xla/tests/fusion_test.cc | 16 +- .../xla/tests/local_client_allocation_test.cc | 7 +- .../xla/tests/local_client_execute_test.cc | 170 ++++++++---------- .../xla/tests/local_client_test_base.cc | 12 +- .../xla/tests/local_client_test_base.h | 11 +- .../xla/tests/transfer_manager_test.cc | 46 ++--- .../xla/tests/xla_hlo_profile_test.cc | 10 +- 34 files changed, 373 insertions(+), 370 deletions(-) diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index 50b0061d692..3520501c1a3 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -32,10 +32,13 @@ limitations under the License. 
#include "tensorflow/core/framework/types.h" #include "tensorflow/core/util/stream_executor_util.h" +namespace { namespace gpu = perftools::gputools; +using xla::ScopedShapedBuffer; +using xla::ShapedBuffer; +} // anonymous namespace namespace tensorflow { - std::map SnapshotResourceVariables(OpKernelContext* ctx, int num_variables) { std::map snapshot; @@ -80,17 +83,17 @@ namespace { // Return the 'index''th subtree of the given ShapedBuffer as a // ScopedShapedBuffer. The returned ScopedShapedBuffer takes ownership of the // subtree, and sets the input's buffer pointers to nullptr for the subtree. -std::unique_ptr ExtractSubShapedBuffer( - xla::ShapedBuffer* shaped_buffer, int index, +ScopedShapedBuffer ExtractSubShapedBuffer( + ShapedBuffer* shaped_buffer, int index, xla::DeviceMemoryAllocator* allocator) { xla::Shape on_host_shape = xla::ShapeUtil::GetTupleElementShape( shaped_buffer->on_host_shape(), index); xla::Shape on_device_shape = xla::ShapeUtil::GetTupleElementShape( shaped_buffer->on_device_shape(), index); - xla::ShapedBuffer sub_shaped_buffer(on_host_shape, on_device_shape, - shaped_buffer->platform(), - shaped_buffer->device_ordinal()); + ShapedBuffer sub_shaped_buffer(on_host_shape, on_device_shape, + shaped_buffer->platform(), + shaped_buffer->device_ordinal()); auto& shape_tree = shaped_buffer->buffers(); auto& sub_shape_tree = sub_shaped_buffer.buffers(); @@ -102,8 +105,7 @@ std::unique_ptr ExtractSubShapedBuffer( index_to_buffer.second = gpu::DeviceMemoryBase(nullptr, 0); } } - return xla::ScopedShapedBuffer::MakeScoped(&sub_shaped_buffer, allocator) - .ValueOrDie(); + return ScopedShapedBuffer(std::move(sub_shaped_buffer), allocator); } } // namespace @@ -118,10 +120,10 @@ XlaComputationLaunchContext::XlaComputationLaunchContext( void XlaComputationLaunchContext::PopulateInputs( OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel, const std::map& variables) { - // Build xla::ShapedBuffers that point directly to the Tensor buffers. + // Build ShapedBuffers that point directly to the Tensor buffers. arg_buffers_.reserve(kernel->xla_input_shapes.size() + 1); arg_buffers_.resize(kernel->xla_input_shapes.size()); - arg_ptrs_ = std::vector(arg_buffers_.size()); + arg_ptrs_ = std::vector(arg_buffers_.size()); // Pass remaining parameters. const Tensor* t; @@ -140,8 +142,7 @@ void XlaComputationLaunchContext::PopulateInputs( if (xla::ShapeUtil::IsTuple(on_device_shape)) { const XlaTensor* xla_tensor = XlaTensor::FromTensor(t); CHECK(xla_tensor && xla_tensor->has_shaped_buffer()); - arg_ptrs_[i] = - const_cast(&xla_tensor->shaped_buffer()); + arg_ptrs_[i] = const_cast(&xla_tensor->shaped_buffer()); } else { CHECK(xla::ShapeUtil::Equal(shape, on_device_shape)) << "On-device shape " @@ -149,7 +150,7 @@ void XlaComputationLaunchContext::PopulateInputs( << " not the same as on-host shape " << xla::ShapeUtil::HumanStringWithLayout(shape); gpu::DeviceMemoryBase dmem = XlaTensor::DeviceMemoryFromTensor(*t); - arg_buffers_[i] = xla::MakeUnique( + arg_buffers_[i] = xla::MakeUnique( /*on_host_shape=*/shape, /*on_device_shape=*/shape, client_->platform(), client_->default_device_ordinal()); arg_buffers_[i]->set_buffer(dmem, /*index=*/{}); @@ -160,15 +161,15 @@ void XlaComputationLaunchContext::PopulateInputs( void XlaComputationLaunchContext::PopulateOutputs( OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel, - std::unique_ptr output) { + ScopedShapedBuffer output) { gpu::Stream* stream = ctx->op_device_context() ? 
ctx->op_device_context()->stream() : nullptr; // Computation output should always be a tuple. if (VLOG_IS_ON(2)) { - VLOG(2) << "Result tuple shape: " << output->on_host_shape().DebugString(); + VLOG(2) << "Result tuple shape: " << output.on_host_shape().DebugString(); VLOG(2) << "Result tuple shape (on device): " - << output->on_device_shape().DebugString(); + << output.on_device_shape().DebugString(); } CHECK_EQ(ctx->num_outputs(), kernel->outputs.size()); @@ -226,18 +227,18 @@ void XlaComputationLaunchContext::PopulateOutputs( const TensorShape& shape = kernel->outputs[i].shape; VLOG(2) << "Retval " << i << " shape " << shape.DebugString(); - gpu::DeviceMemoryBase buffer = output->buffer({output_num}); + gpu::DeviceMemoryBase buffer = output.buffer({output_num}); if (allocate_xla_tensors_) { Tensor* output_tensor; OP_REQUIRES_OK(ctx, ctx->allocate_output(i, shape, &output_tensor)); XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor); CHECK(xla_tensor); - xla_tensor->set_shaped_buffer( - ExtractSubShapedBuffer(output.get(), output_num, xla_allocator_)); + xla_tensor->set_shaped_buffer(ScopedShapedBuffer( + ExtractSubShapedBuffer(&output, output_num, xla_allocator_))); } else { Tensor output_tensor = XlaTensorBuffer::MakeTensor( ctx->expected_output_dtype(i), shape, buffer, allocator); - output->set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num}); + output.set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num}); ctx->set_output(i, output_tensor); } ++output_num; @@ -257,7 +258,7 @@ void XlaComputationLaunchContext::PopulateOutputs( write.input_index >= 0 && write.input_index < ctx->num_inputs(), errors::Internal("Invalid input index for variable write.")); - gpu::DeviceMemoryBase buffer = output->buffer({output_num}); + gpu::DeviceMemoryBase buffer = output.buffer({output_num}); Var* variable = nullptr; // TODO(b/35625933): tensorflow::Var should contain a PersistentTensor, @@ -282,12 +283,12 @@ void XlaComputationLaunchContext::PopulateOutputs( XlaTensor* xla_tensor = XlaTensor::FromTensor(&output_tensor); CHECK(xla_tensor); xla_tensor->set_shaped_buffer( - ExtractSubShapedBuffer(output.get(), output_num, xla_allocator_)); + ExtractSubShapedBuffer(&output, output_num, xla_allocator_)); *variable->tensor() = output_tensor; } else { Tensor output_tensor = XlaTensorBuffer::MakeTensor( write.type, write.shape, buffer, allocator); - output->set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num}); + output.set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num}); *variable->tensor() = output_tensor; } ++output_num; diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index 14f70fe3589..26dcaa8a51d 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -87,7 +87,7 @@ class XlaComputationLaunchContext { // Given the XLA output in `output`, populate all outputs of `ctx`. void PopulateOutputs(OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel, - std::unique_ptr output); + xla::ScopedShapedBuffer output); // Return the argument list. Only valid after PopulateInputs() has been // called. 
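A note on the ownership dance in ExtractSubShapedBuffer above: the extracted subtree copies the raw device pointers and then nulls them out in the source, so exactly one owner remains to free them. The same idea in a minimal standalone form, where Buffer and SubTree are simplified stand-ins for DeviceMemoryBase and the shape tree:

#include <cstddef>
#include <vector>

struct Buffer {
  void* ptr = nullptr;
  size_t size = 0;
};

struct SubTree {
  std::vector<Buffer> buffers;
};

// Copy the child's raw pointers out, then zero them in the source so a later
// "free everything" pass over 'tuple' cannot double-free what 'extracted'
// now owns.
SubTree ExtractSubTree(std::vector<SubTree>* tuple, int index) {
  SubTree extracted;
  extracted.buffers = (*tuple)[index].buffers;
  for (Buffer& b : (*tuple)[index].buffers) {
    b = Buffer{nullptr, 0};
  }
  return extracted;
}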
diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc index 956328e6757..84b2835c406 100644 --- a/tensorflow/compiler/jit/xla_tensor.cc +++ b/tensorflow/compiler/jit/xla_tensor.cc @@ -65,10 +65,8 @@ Status XlaTensor::AllocateShapedBuffer(DataType dtype, const TensorShape& shape, device_ordinal, size, /*retry_on_failure=*/false)); } - TF_ASSIGN_OR_RETURN(auto scoped_buffer, - xla::ScopedShapedBuffer::MakeScoped( - &buffer, client->backend().memory_allocator())); - set_shaped_buffer(std::move(scoped_buffer)); + set_shaped_buffer(xla::ScopedShapedBuffer( + std::move(buffer), client->backend().memory_allocator())); return Status::OK(); } diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h index 5ff2fb08f03..2334fd272be 100644 --- a/tensorflow/compiler/jit/xla_tensor.h +++ b/tensorflow/compiler/jit/xla_tensor.h @@ -64,9 +64,9 @@ class XlaTensor { return *shaped_buffer_; } // Mutates the TensorInfo to set the ShapedBuffer. - void set_shaped_buffer( - std::unique_ptr shaped_buffer) { - shaped_buffer_ = std::move(shaped_buffer); + void set_shaped_buffer(xla::ScopedShapedBuffer shaped_buffer) { + shaped_buffer_ = + xla::MakeUnique(std::move(shaped_buffer)); } // Some tensors on the device may have known values on the host. We use these diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index d951c44cb92..d0e945b70fd 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -134,7 +134,7 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions( return Status::OK(); } -StatusOr> LocalExecutable::Run( +StatusOr LocalExecutable::Run( const tensorflow::gtl::ArraySlice arguments, ExecutableRunOptions run_options) { TF_RETURN_IF_ERROR( @@ -167,27 +167,26 @@ StatusOr> LocalExecutable::Run( return ExecuteAndDump(&service_options, arguments); } TF_ASSIGN_OR_RETURN( - std::unique_ptr result, + ShapedBuffer result, executable_->ExecuteOnStreamWrapper( &service_options, run_options.execution_profile(), arguments)); - return MakeUnique(std::move(*result), - run_options.allocator()); + return ScopedShapedBuffer(std::move(result), run_options.allocator()); } -StatusOr> LocalExecutable::ExecuteAndDump( +StatusOr LocalExecutable::ExecuteAndDump( const ServiceExecutableRunOptions* run_options, const tensorflow::gtl::ArraySlice arguments) { executable_->session_module()->set_execution_platform( backend_->platform()->Name()); TF_RETURN_IF_ERROR(RecordArguments(arguments, executable_->session_module())); TF_ASSIGN_OR_RETURN( - std::unique_ptr result, + ShapedBuffer result, executable_->ExecuteOnStream(run_options, arguments, /*hlo_execution_profile=*/nullptr)); - TF_RETURN_IF_ERROR(RecordResult(result.get(), executable_->session_module())); + TF_RETURN_IF_ERROR(RecordResult(&result, executable_->session_module())); TF_RETURN_IF_ERROR(executable_->DumpSessionModule()); - return ScopedShapedBuffer::MakeScoped(result.get(), run_options->allocator()); + return ScopedShapedBuffer(std::move(result), run_options->allocator()); } tensorflow::Status LocalExecutable::RecordArguments( @@ -281,9 +280,9 @@ StatusOr> LocalClient::Compile( updated_options)); } -StatusOr> -LocalClient::LiteralToShapedBuffer(const Literal& literal, int device_ordinal, - DeviceMemoryAllocator* allocator) { +StatusOr LocalClient::LiteralToShapedBuffer( + const Literal& literal, int device_ordinal, + DeviceMemoryAllocator* allocator) { if (allocator == nullptr) { 
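The declaration this comment documents now returns its buffer by value. A sketch of the resulting round trip, assuming the usual XLA headers and a pre-compiled LocalExecutable; only LiteralToShapedBuffer, Run, and ShapedBufferToLiteral come from this patch, the rest is illustrative glue with error handling reduced to ConsumeValueOrDie():

// Sketch only: run a compiled computation on one literal argument and read
// the result back as a Literal.
std::unique_ptr<xla::Literal> RunOnce(xla::LocalClient* client,
                                      xla::LocalExecutable* executable,
                                      const xla::Literal& argument) {
  // The ScopedShapedBuffer arrives by value and frees its device memory when
  // it goes out of scope at the end of this function.
  xla::ScopedShapedBuffer arg =
      client->LiteralToShapedBuffer(argument, /*device_ordinal=*/0)
          .ConsumeValueOrDie();
  xla::ScopedShapedBuffer result =
      executable->Run({&arg}, xla::ExecutableRunOptions()).ConsumeValueOrDie();
  return client->ShapedBufferToLiteral(result).ConsumeValueOrDie();
}

The declaration itself follows.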
allocator = backend().memory_allocator(); } @@ -293,7 +292,7 @@ LocalClient::LiteralToShapedBuffer(const Literal& literal, int device_ordinal, TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor, backend().stream_executor(device_ordinal)); TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice( - executor, literal, *scoped_buffer)); + executor, literal, scoped_buffer)); return std::move(scoped_buffer); } diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h index 42812b936f2..f306c520ede 100644 --- a/tensorflow/compiler/xla/client/local_client.h +++ b/tensorflow/compiler/xla/client/local_client.h @@ -38,7 +38,7 @@ class LocalExecutable { public: // Run the compiled computation with the given arguments and options and // return the result. - StatusOr> Run( + StatusOr Run( const tensorflow::gtl::ArraySlice arguments, ExecutableRunOptions run_options); @@ -73,7 +73,7 @@ class LocalExecutable { // Records the computation in a SessionModule proto with the arguments used to // invoke it, and the result. Enabled by flag: --tla_dump_executions_to. - StatusOr> ExecuteAndDump( + StatusOr ExecuteAndDump( const ServiceExecutableRunOptions* run_options, const tensorflow::gtl::ArraySlice arguments); @@ -136,7 +136,7 @@ class LocalClient : public Client { // ScopedShapedBuffer. If non-null the given memory allocator is used for // device memory allocation. If null, the default memory allocator for the // device is used. - StatusOr> LiteralToShapedBuffer( + StatusOr LiteralToShapedBuffer( const Literal& literal, int device_ordinal, DeviceMemoryAllocator* allocator = nullptr); diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc index 2bacc6a9142..24e17abbe06 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.cc +++ b/tensorflow/compiler/xla/python/local_computation_builder.cc @@ -89,17 +89,16 @@ StatusOr> TransferFromOutfeedLocalReplica( return client->TransferFromOutfeedLocal(shape, device_ordinal); } -LocalShapedBuffer::LocalShapedBuffer( - std::unique_ptr shaped_buffer) +LocalShapedBuffer::LocalShapedBuffer(ScopedShapedBuffer shaped_buffer) : shaped_buffer_(std::move(shaped_buffer)) {} -const std::unique_ptr& LocalShapedBuffer::shaped_buffer() - const { - return shaped_buffer_; +const ScopedShapedBuffer* LocalShapedBuffer::shaped_buffer() const { + return &shaped_buffer_; } -static StatusOr> ToBuffer( - LocalClient* client, int device_ordinal, const Literal& arg) { +static StatusOr ToBuffer(LocalClient* client, + int device_ordinal, + const Literal& arg) { return client->LiteralToShapedBuffer(arg, device_ordinal, client->backend().memory_allocator()); } @@ -109,14 +108,15 @@ LocalShapedBuffer* LocalShapedBuffer::FromLiteral( const Literal& argument, const tensorflow::gtl::optional& shape_with_layout) { LocalClient* client = GetOrCreateLocalClient(); - std::unique_ptr buf; - if (shape_with_layout) { - std::unique_ptr relaid = - argument.Relayout(shape_with_layout.value()); - buf = ToBuffer(client, /*device_ordinal=*/0, *relaid).ConsumeValueOrDie(); - } else { - buf = ToBuffer(client, /*device_ordinal=*/0, argument).ConsumeValueOrDie(); - } + ScopedShapedBuffer buf = [&] { + if (shape_with_layout) { + std::unique_ptr relaid = + argument.Relayout(shape_with_layout.value()); + return ToBuffer(client, /*device_ordinal=*/0, *relaid) + .ConsumeValueOrDie(); + } + return ToBuffer(client, /*device_ordinal=*/0, argument).ConsumeValueOrDie(); + 
}(); return new LocalShapedBuffer(std::move(buf)); } @@ -158,14 +158,14 @@ StatusOr> CompiledLocalComputation::Execute( << device_ordinal; // Transfer arguments in - std::vector> scoped_buffers; + std::vector scoped_buffers; scoped_buffers.reserve(arguments.size()); for (int i = 0; i < arguments.size(); ++i) { const Literal& argument = arguments[i]; const tensorflow::gtl::optional& shape_with_layout = shapes_with_layout[i]; - StatusOr> pushed; + StatusOr pushed; if (shape_with_layout) { std::unique_ptr relaid = argument.Relayout(shape_with_layout.value()); @@ -185,7 +185,7 @@ StatusOr> CompiledLocalComputation::Execute( std::vector argument_buffers; argument_buffers.reserve(scoped_buffers.size()); for (auto& buffer : scoped_buffers) { - argument_buffers.push_back(buffer.get()); + argument_buffers.push_back(&buffer); } DeviceAssignment device_assignment = @@ -202,7 +202,7 @@ StatusOr> CompiledLocalComputation::Execute( options.set_intra_op_thread_pool( client->backend().eigen_intra_op_thread_pool_device()); options.set_device_assignment(&device_assignment); - StatusOr> result_buffer_status = + StatusOr result_buffer_status = executable_->Run(argument_buffers, options); if (!result_buffer_status.ok()) { results[replica] = result_buffer_status.status(); @@ -210,8 +210,8 @@ StatusOr> CompiledLocalComputation::Execute( } // Transfer result out - results[replica] = - client->ShapedBufferToLiteral(*result_buffer_status.ValueOrDie()); + results[replica] = client->ShapedBufferToLiteral( + std::move(result_buffer_status).ValueOrDie()); }); } } @@ -236,7 +236,7 @@ LocalShapedBuffer* CompiledLocalComputation::ExecuteWithShapedBuffers( std::vector argument_buffers; argument_buffers.reserve(argument_handles.size()); for (auto& handle : argument_handles) { - argument_buffers.push_back(handle->shaped_buffer().get()); + argument_buffers.push_back(handle->shaped_buffer()); } // Execute @@ -245,7 +245,7 @@ LocalShapedBuffer* CompiledLocalComputation::ExecuteWithShapedBuffers( options.set_inter_op_thread_pool(client->backend().inter_op_thread_pool()); options.set_intra_op_thread_pool( client->backend().eigen_intra_op_thread_pool_device()); - std::unique_ptr result_buffer = + ScopedShapedBuffer result_buffer = executable_->Run(argument_buffers, options).ConsumeValueOrDie(); return new LocalShapedBuffer(std::move(result_buffer)); diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h index 31046e60f11..e1048909ab2 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.h +++ b/tensorflow/compiler/xla/python/local_computation_builder.h @@ -62,12 +62,12 @@ class LocalShapedBuffer { static LocalShapedBuffer* FromLiteral( const Literal& argument, const tensorflow::gtl::optional& shape_with_layout); - LocalShapedBuffer(std::unique_ptr shaped_buffer); - const std::unique_ptr& shaped_buffer() const; + LocalShapedBuffer(ScopedShapedBuffer shaped_buffer); + const ScopedShapedBuffer* shaped_buffer() const; std::unique_ptr ToLiteral() const; private: - std::unique_ptr shaped_buffer_; + ScopedShapedBuffer shaped_buffer_; }; // Wraps a LocalExecutable produced by compiling a diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc index 359582a78c3..6bf65825cd0 100644 --- a/tensorflow/compiler/xla/service/allocation_tracker.cc +++ b/tensorflow/compiler/xla/service/allocation_tracker.cc @@ -31,52 +31,51 @@ limitations under the License. 
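The immediately-invoked lambda introduced for `buf` in the local_computation_builder.cc hunk above is a general C++ pattern for initializing a move-only value that needs branching logic: every branch returns a fully-constructed object, so no default-constructed placeholder or two-phase initialization is needed. A generic, runnable illustration:

#include <memory>
#include <string>

// Each branch of the lambda returns a complete object; 'greeting' is
// constructed exactly once, directly from the lambda's return value.
std::unique_ptr<std::string> MakeGreeting(bool shout) {
  auto greeting = [&] {
    if (shout) {
      return std::make_unique<std::string>("HELLO");
    }
    return std::make_unique<std::string>("hello");
  }();
  return greeting;
}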
namespace xla { StatusOr AllocationTracker::Register( - std::unique_ptr shaped_buffer, const string& tag) { + ShapedBuffer shaped_buffer, const string& tag) { tensorflow::mutex_lock lock(mutex_); VLOG(2) << "Register"; - std::vector> replicated_buffers; + std::vector replicated_buffers; replicated_buffers.emplace_back(std::move(shaped_buffer)); return RegisterInternal(std::move(replicated_buffers), tag); } StatusOr AllocationTracker::RegisterReplicatedBuffers( - std::vector> replicated_buffers, - const string& tag) { + std::vector replicated_buffers, const string& tag) { tensorflow::mutex_lock lock(mutex_); VLOG(2) << "RegisterReplicatedBuffers"; return RegisterInternal(std::move(replicated_buffers), tag); } StatusOr AllocationTracker::RegisterInternal( - std::vector> replicated_buffers, - const string& tag) { + std::vector replicated_buffers, const string& tag) { VLOG(2) << "RegisterInternal(" << "tag: \"" << tag << "\" with " << replicated_buffers.size() << " shaped_buffers."; for (const auto& shaped_buffer : replicated_buffers) { - VLOG(2) << "shaped_buffer:" << *shaped_buffer; - if (shaped_buffer->platform() != backend_->platform()) { + VLOG(2) << "shaped_buffer:" << shaped_buffer; + if (shaped_buffer.platform() != backend_->platform()) { return InvalidArgument( "AllocationTracker for platform %s cannot register buffer from " "platform %s", backend_->platform()->Name().c_str(), - shaped_buffer->platform()->Name().c_str()); + shaped_buffer.platform()->Name().c_str()); } } int64 handle = next_handle_++; for (auto& shaped_buffer : replicated_buffers) { std::vector shape_indices; - ShapeUtil::ForEachSubshape(shaped_buffer->on_device_shape(), + ShapeUtil::ForEachSubshape(shaped_buffer.on_device_shape(), [this, &shape_indices](const Shape& /*subshape*/, const ShapeIndex& index) { shape_indices.push_back(index); }); for (const ShapeIndex& index : shape_indices) { - AddAllocationOrIncrementRefCount(shaped_buffer->buffer(index), - shaped_buffer->device_ordinal()); + AddAllocationOrIncrementRefCount(shaped_buffer.buffer(index), + shaped_buffer.device_ordinal()); } - handle_to_shaped_buffers_[handle].emplace_back(std::move(shaped_buffer)); + handle_to_shaped_buffers_[handle].emplace_back( + MakeUnique(std::move(shaped_buffer))); } GlobalDataHandle result; @@ -146,13 +145,13 @@ StatusOr> AllocationTracker::DeconstructTuple( for (int i = 0; i < ShapeUtil::TupleElementCount(shaped_buffer->on_device_shape()); ++i) { - auto element_buffer = MakeUnique( + auto element_buffer = ShapedBuffer( ShapeUtil::GetTupleElementShape(shaped_buffer->on_host_shape(), i), ShapeUtil::GetTupleElementShape(shaped_buffer->on_device_shape(), i), shaped_buffer->platform(), shaped_buffer->device_ordinal()); - element_buffer->set_buffer(shaped_buffer->buffer(/*index=*/{i}), - /*index=*/{}); - std::vector> replicated_buffers; + element_buffer.set_buffer(shaped_buffer->buffer(/*index=*/{i}), + /*index=*/{}); + std::vector replicated_buffers; replicated_buffers.emplace_back(std::move(element_buffer)); TF_ASSIGN_OR_RETURN( GlobalDataHandle element_handle, diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h index 60e93358efb..2bfcd537129 100644 --- a/tensorflow/compiler/xla/service/allocation_tracker.h +++ b/tensorflow/compiler/xla/service/allocation_tracker.h @@ -45,14 +45,13 @@ class AllocationTracker { // Registers a shaped buffer of device memory, and returns a corresponding // handle that can be used for talking to XLA clients. 
The given shaped buffer // will be treated as the buffer corresponding to the only replica. - StatusOr Register( - std::unique_ptr shaped_buffer, const string& tag); + StatusOr Register(ShapedBuffer shaped_buffer, + const string& tag); // Registers a vector of shaped buffers of device memory, one per replica, and // returns a corresponding handle that can be used for talking to XLA clients. StatusOr RegisterReplicatedBuffers( - std::vector> replicated_buffers, - const string& tag); + std::vector replicated_buffers, const string& tag); // Unregister the allocation for the given data handle. Status Unregister(const GlobalDataHandle& data); @@ -95,8 +94,8 @@ class AllocationTracker { // Internal helper which registers a vector of shaped buffers, one per // replica. StatusOr RegisterInternal( - std::vector> replicated_buffers, - const string& tag) EXCLUSIVE_LOCKS_REQUIRED(mutex_); + std::vector replicated_buffers, const string& tag) + EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Resets the shaped buffers corresponding to the given handle. Status Reset(const GlobalDataHandle& data) EXCLUSIVE_LOCKS_REQUIRED(mutex_); @@ -132,6 +131,9 @@ class AllocationTracker { // A map from data handle to a vector of shaped buffers that represent the // buffers for different replicas. + // + // The ShapedBuffers in this map's vectors need to be unique_ptrs, because our + // public API returns pointers to them. tensorflow::gtl::FlatMap>> handle_to_shaped_buffers_ GUARDED_BY(mutex_); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index aee62a4935e..97e550abe44 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -243,18 +243,18 @@ static Status DeallocateTempBuffers( return Status::OK(); } -StatusOr> CpuExecutable::CreateResultShapedBuffer( +StatusOr CpuExecutable::CreateResultShapedBuffer( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice allocated_buffers, std::vector* buffers_in_result) { se::Stream* stream = run_options->stream(); - auto result_buffer = MakeUnique( + ShapedBuffer result_buffer( /*on_host_shape=*/result_shape(), /*on_device_shape=*/result_shape(), stream->parent()->platform(), stream->parent()->device_ordinal()); // Copy DeviceMemoryBase values which contain the array(s) of the result into // the respective location in ShapedBuffer which is returned to the caller. - TF_RETURN_IF_ERROR(result_buffer->buffers().ForEachMutableElementWithStatus( + TF_RETURN_IF_ERROR(result_buffer.buffers().ForEachMutableElementWithStatus( [&](const ShapeIndex& index, se::DeviceMemoryBase* device_memory) { const auto& sources = this->GetRootPointsToSet().element(index); // The points to set is unambiguous so the set should be a @@ -281,7 +281,7 @@ StatusOr> CpuExecutable::CreateResultShapedBuffer( return std::move(result_buffer); } -StatusOr> CpuExecutable::ExecuteOnStream( +StatusOr CpuExecutable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) { @@ -300,7 +300,7 @@ StatusOr> CpuExecutable::ExecuteOnStream( std::vector buffers_in_result(assignment_->Allocations().size(), false); TF_ASSIGN_OR_RETURN( - std::unique_ptr result_buffer, + ShapedBuffer result_buffer, CreateResultShapedBuffer(run_options, buffers, &buffers_in_result)); // Free all buffers not in the result. 
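The "free all buffers not in the result" step above is a mark-and-sweep over the allocation list: every allocation starts as a temp, buffers reached while wiring up the result are marked live, and a final pass frees only the unmarked ones. A minimal sketch of the bookkeeping, where Free() stands in for the device allocator call:

#include <vector>

void Free(int /*buffer_index*/) {}  // stand-in for the allocator call

void ReleaseTemps(int num_allocations,
                  const std::vector<int>& result_buffers) {
  std::vector<bool> in_result(num_allocations, false);
  for (int i : result_buffers) {
    in_result[i] = true;  // ownership transfers to the returned ShapedBuffer
  }
  for (int i = 0; i < num_allocations; ++i) {
    if (!in_result[i]) {
      Free(i);
    }
  }
}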
@@ -310,7 +310,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteOnStream(
   return std::move(result_buffer);
 }

-StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteAsyncOnStream(
+StatusOr<ShapedBuffer> CpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   if (hlo_profiling_enabled()) {
@@ -330,7 +330,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteAsyncOnStream(
   std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
   TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ShapedBuffer> result_buffer,
+      ShapedBuffer result_buffer,
       CreateResultShapedBuffer(run_options, buffers, &buffers_in_result));

   LogLiveAddresses(buffers, buffers_in_result);
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index c3c2820c26c..06b6943cb5a 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -55,12 +55,12 @@ class CpuExecutable : public Executable {
       std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
   ~CpuExecutable() override {}

-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
+  StatusOr<ShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;

-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
+  StatusOr<ShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;

@@ -102,13 +102,13 @@ class CpuExecutable : public Executable {
       tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
       HloExecutionProfile* hlo_execution_profile);

-  // Create a ShapedBuffer for holding the result of the computation. The
+  // Creates a ShapedBuffer for holding the result of the computation. The
   // addresses (DeviceMemoryBases) are set according to buffer assignment.
   // 'buffers_in_result' should point to a vector of the same size as
   // 'allocated_buffers'. An element in buffers_in_result is set to true if the
   // corresponding buffer is live out of the computation (and thus contained in
   // the returned ShapedBuffer).
-  StatusOr<std::unique_ptr<ShapedBuffer>> CreateResultShapedBuffer(
+  StatusOr<ShapedBuffer> CreateResultShapedBuffer(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> allocated_buffers,
       std::vector<bool>* buffers_in_result);
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
index 2d0f1d0be5f..a2bd4fa195b 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
@@ -447,7 +447,7 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
   return Status::OK();
 }

-StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
+StatusOr<ShapedBuffer> ParallelCpuExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     HloExecutionProfile* hlo_execution_profile) {
@@ -459,7 +459,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
   DeviceMemoryAllocator* memory_allocator = run_options->allocator();
   std::vector<se::DeviceMemoryBase> buffers(assignment_->Allocations().size());

-  auto result_buffer = MakeUnique<ShapedBuffer>(
+  ShapedBuffer result_buffer(
       /*on_host_shape=*/result_shape(), /*on_device_shape=*/result_shape(),
       stream->parent()->platform(), stream->parent()->device_ordinal());

@@ -472,7 +472,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
   // Copy DeviceMemoryBase values which into the respective location in
   // ShapedBuffer which is returned to the caller.
   std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
-  TF_RETURN_IF_ERROR(result_buffer->buffers().ForEachMutableElementWithStatus(
+  TF_RETURN_IF_ERROR(result_buffer.buffers().ForEachMutableElementWithStatus(
       [&](const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
         const auto& sources = this->GetRootPointsToSet().element(index);
@@ -511,8 +511,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
   return std::move(result_buffer);
 }

-StatusOr<std::unique_ptr<ShapedBuffer>>
-ParallelCpuExecutable::ExecuteAsyncOnStream(
+StatusOr<ShapedBuffer> ParallelCpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   // TODO(b/30671675): Implement asynchronous execution mode.
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
index d87ba57a1e4..5ce84fa9964 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
@@ -59,12 +59,12 @@ class ParallelCpuExecutable : public Executable {
       std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
   ~ParallelCpuExecutable() override {}

-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
+  StatusOr<ShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;

-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
+  StatusOr<ShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index caa46686be1..b097ef79cc6 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -29,18 +29,19 @@ using tensorflow::gtl::ArraySlice;

 namespace xla {

-StatusOr<std::vector<std::unique_ptr<ShapedBuffer>>>
-Executable::ExecuteOnStreams(
+StatusOr<std::vector<ShapedBuffer>> Executable::ExecuteOnStreams(
     ArraySlice<const ServiceExecutableRunOptions> run_options,
     ArraySlice<ArraySlice<const ShapedBuffer*>> arguments) {
   TF_RET_CHECK(run_options.size() == arguments.size());

-  std::vector<std::unique_ptr<ShapedBuffer>> return_values(run_options.size());
+  std::vector<ShapedBuffer> return_values;
+  return_values.reserve(run_options.size());

   if (run_options.size() == 1) {
-    TF_ASSIGN_OR_RETURN(return_values[0],
+    TF_ASSIGN_OR_RETURN(auto rv,
                         ExecuteOnStream(&run_options[0], arguments[0],
                                         /*hlo_execution_profile=*/nullptr));
+    return_values.push_back(std::move(rv));
     return std::move(return_values);
   }
@@ -48,8 +49,9 @@ Executable::ExecuteOnStreams(
   // We cannot BlockHostUntilDone() on the already-launched executions in case
   // of error, since if the executions communicate, the initially launched
   // executions may never complete if not all executions are running.
-    TF_ASSIGN_OR_RETURN(return_values[i],
+    TF_ASSIGN_OR_RETURN(auto rv,
                         ExecuteAsyncOnStream(&run_options[i], arguments[i]));
+    return_values.push_back(std::move(rv));
   }
   for (const auto& options : run_options) {
     TF_RET_CHECK(options.stream() != nullptr);
@@ -58,7 +60,7 @@ Executable::ExecuteOnStreams(
   return std::move(return_values);
 }

-StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper(
+StatusOr<ShapedBuffer> Executable::ExecuteOnStreamWrapper(
     const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile,
     ArraySlice<const ShapedBuffer*> arguments) {
   se::Stream* stream = run_options->stream();
@@ -78,7 +80,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper(
           &hlo_profile_index_map())
           : nullptr;

-  StatusOr<std::unique_ptr<ShapedBuffer>> return_value =
+  StatusOr<ShapedBuffer> return_value =
       ExecuteOnStream(run_options, arguments, profile_ptr.get());
   TF_RETURN_IF_ERROR(return_value.status());
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 6f4cd99767f..9c725f21d80 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -62,14 +62,14 @@ class Executable {
   // enabled.
   //
   // Returns a shaped buffer containing the result of the computation.
-  virtual StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
+  virtual StatusOr<ShapedBuffer> ExecuteOnStream(
      const ServiceExecutableRunOptions* run_options,
      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
      HloExecutionProfile* hlo_execution_profile) = 0;

   // Same as ExecuteOnStream(), but this call is non-blocking and returns as
   // soon as all of the operations are enqueued for launch on the stream.
-  virtual StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
+  virtual StatusOr<ShapedBuffer> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) = 0;

@@ -77,7 +77,7 @@ class Executable {
   // streams. arguments[i] contains the arguments to the execution on
   // run_options[i]->stream() and the returned value is at index i of the
   // returned vector.
-  virtual StatusOr<std::vector<std::unique_ptr<ShapedBuffer>>> ExecuteOnStreams(
+  virtual StatusOr<std::vector<ShapedBuffer>> ExecuteOnStreams(
      tensorflow::gtl::ArraySlice<const ServiceExecutableRunOptions>
          run_options,
      tensorflow::gtl::ArraySlice<
@@ -97,7 +97,7 @@ class Executable {
   // Convenience wrapper for calling Executable::ExecuteOnStream. Sets up a
   // timer for the execution, sets up HLO profiling if enabled, and fills in the
   // given ExecutionProfile if non-null.
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStreamWrapper(
+  StatusOr<ShapedBuffer> ExecuteOnStreamWrapper(
      const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile,
      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments);
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 5676d4de8e3..62ce15bc59d 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -250,7 +250,7 @@ Status GpuExecutable::ExecuteThunks(
   return Status::OK();
 }

-StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
+StatusOr<ShapedBuffer> GpuExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     HloExecutionProfile* hlo_execution_profile) {
@@ -297,13 +297,13 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
   HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
   auto device_ordinal = executor->device_ordinal();
-  auto shaped_buffer = MakeUnique<ShapedBuffer>(
-      root->shape(), root->shape(), executor->platform(), device_ordinal);
+  auto shaped_buffer = ShapedBuffer(root->shape(), root->shape(),
+                                    executor->platform(), device_ordinal);

   // Copy DeviceMemoryBase values which contain the array(s) of the result into
   // the respective location in ShapedBuffer.
   std::set<se::DeviceMemoryBase> buffers_in_result;
-  TF_RETURN_IF_ERROR(shaped_buffer->buffers().ForEachMutableElementWithStatus(
+  TF_RETURN_IF_ERROR(shaped_buffer.buffers().ForEachMutableElementWithStatus(
       [&buffer_allocations, &buffers_in_result, &shaped_buffer, this](
           const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
         const auto& sources = this->GetRootPointsToSet().element(index);
@@ -335,7 +335,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
   return std::move(shaped_buffer);
 }

-StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteAsyncOnStream(
+StatusOr<ShapedBuffer> GpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   // TODO(b/30671675): Implement asynchronous execution mode.
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index dcb3991f41a..361bc30b2f3 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -74,12 +74,12 @@ class GpuExecutable : public Executable {
   // ExecuteOnStream will fail if the compute capability of the stream doesn't
   // match the compute capability passed to this object's constructor.
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
+  StatusOr<ShapedBuffer> ExecuteOnStream(
      const ServiceExecutableRunOptions* run_options,
      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
      HloExecutionProfile* hlo_execution_profile) override;

-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
+  StatusOr<ShapedBuffer> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 171477299e4..df5ffd0b7d6 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -107,33 +107,35 @@ StatusOr<std::unique_ptr<Literal>> HloRunner::Execute(
   const ExecutableRunOptions& run_options = service_run_options.run_options();

   // Copy arguments to device.
-  std::vector<std::unique_ptr<ScopedShapedBuffer>> argument_buffers;
-  std::vector<const ShapedBuffer*> argument_buffer_ptrs;
+  std::vector<ScopedShapedBuffer> argument_buffers;
   for (Literal* argument : arguments) {
     TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<ScopedShapedBuffer> argument_buffer,
+        ScopedShapedBuffer argument_buffer,
         backend().transfer_manager()->AllocateScopedShapedBuffer(
             argument->shape(), run_options.allocator(),
             run_options.device_ordinal()));
     TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
-        stream.parent(), *argument, *argument_buffer));
+        stream.parent(), *argument, argument_buffer));
     argument_buffers.push_back(std::move(argument_buffer));
-    argument_buffer_ptrs.push_back(argument_buffers.back().get());
+  }
+
+  std::vector<const ShapedBuffer*> argument_buffer_ptrs;
+  argument_buffer_ptrs.reserve(argument_buffers.size());
+  for (const auto& buf : argument_buffers) {
+    argument_buffer_ptrs.push_back(&buf);
   }

   TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ShapedBuffer> result,
+      ShapedBuffer result,
       executable->ExecuteOnStreamWrapper(
           &service_run_options, /*profile=*/nullptr, argument_buffer_ptrs));

   // Create a ScopedShapedBuffer of the result to manage deallocation. This will
   // deallocate all the device memory when it goes out of scope.
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ScopedShapedBuffer> scoped_result,
-      ScopedShapedBuffer::MakeScoped(result.get(), run_options.allocator()));
+  ScopedShapedBuffer scoped_result(std::move(result), run_options.allocator());

   auto result_literal =
       backend().transfer_manager()->TransferLiteralFromDevice(
-          stream.parent(), *scoped_result);
+          stream.parent(), scoped_result);
   if (result_literal.ok()) {
     VLOG(4) << "Executed binary and got result: "
             << result_literal.ValueOrDie()->ToString();
@@ -155,7 +157,13 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
       backend().computation_placer()->AssignDevices(options.num_replicas, 1));
   std::vector<std::unique_ptr<se::Stream>> streams;
   std::vector<ServiceExecutableRunOptions> service_run_options;
-  std::vector<std::unique_ptr<ScopedShapedBuffer>> argument_buffers;
+
+  std::vector<ScopedShapedBuffer> argument_buffers;
+  // This reserve() call is necessary for correctness, because
+  // argument_buffer_ptrs contains pointers into the elements of
+  // argument_buffers.
+  argument_buffers.reserve(options.num_replicas * options.arguments.size());
+
   // Plus one so we can safely get &argument_buffer_ptrs[0] in case there are
   // no arguments.
   std::vector<const ShapedBuffer*> argument_buffer_ptrs(
@@ -175,13 +183,13 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
     // Copy arguments to device.
     for (const Literal* argument : options.arguments) {
       TF_ASSIGN_OR_RETURN(
-          std::unique_ptr<ScopedShapedBuffer> argument_buffer,
+          ScopedShapedBuffer argument_buffer,
           backend().transfer_manager()->AllocateScopedShapedBuffer(
               argument->shape(), backend().memory_allocator(), device));
       TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
-          executor, *argument, *argument_buffer));
+          executor, *argument, argument_buffer));
       argument_buffers.push_back(std::move(argument_buffer));
-      argument_buffer_ptrs[index++] = argument_buffers.back().get();
+      argument_buffer_ptrs[index++] = &argument_buffers.back();
     }
     argument_buffer_slices.emplace_back(
         &argument_buffer_ptrs[index - options.arguments.size()],
@@ -240,19 +248,18 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
   }

   LOG(INFO) << "Replicated execution started";
-  TF_ASSIGN_OR_RETURN(std::vector<std::unique_ptr<ShapedBuffer>> results,
+  TF_ASSIGN_OR_RETURN(std::vector<ShapedBuffer> results,
                       executable->ExecuteOnStreams(service_run_options,
                                                    argument_buffer_slices));
   LOG(INFO) << "Replicated execution terminated";

   std::vector<std::unique_ptr<Literal>> exec_results;
   for (int64 i = 0; i < options.num_replicas; ++i) {
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<ScopedShapedBuffer> result,
-                        ScopedShapedBuffer::MakeScoped(
-                            results[i].get(), backend().memory_allocator()));
+    ScopedShapedBuffer result(std::move(results[i]),
+                              backend().memory_allocator());
     TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> literal,
                         backend().transfer_manager()->TransferLiteralFromDevice(
-                            streams[i]->parent(), *result));
+                            streams[i]->parent(), result));
     exec_results.push_back(std::move(literal));
   }
   return std::move(exec_results);
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index acfa79ea750..6553000336b 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -45,7 +45,7 @@ InterpreterExecutable::InterpreterExecutable(

 InterpreterExecutable::~InterpreterExecutable() {}

-StatusOr<std::unique_ptr<ShapedBuffer>> InterpreterExecutable::ExecuteOnStream(
+StatusOr<ShapedBuffer> InterpreterExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     HloExecutionProfile* hlo_execution_profile) {
@@ -88,12 +88,12 @@ StatusOr<std::unique_ptr<ShapedBuffer>> InterpreterExecutable::ExecuteOnStream(
       evaluator.Evaluate<std::unique_ptr<Literal>>(*computation, arg_literals));

   // Transform the result literal back into a ShapedBuffer.
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<ShapedBuffer> result,
+  TF_ASSIGN_OR_RETURN(ShapedBuffer result,
                       transfer_manager->AllocateShapedBuffer(
                           result_literal->shape(), run_options->allocator(),
                           executor->device_ordinal()));
   TF_RETURN_IF_ERROR(transfer_manager->TransferLiteralToDevice(
-      executor, *result_literal, *result));
+      executor, *result_literal, result));

   uint64 end_micros = tensorflow::Env::Default()->NowMicros();

@@ -106,8 +106,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> InterpreterExecutable::ExecuteOnStream(
   return std::move(result);
 }

-StatusOr<std::unique_ptr<ShapedBuffer>>
-InterpreterExecutable::ExecuteAsyncOnStream(
+StatusOr<ShapedBuffer> InterpreterExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   return tensorflow::errors::Unimplemented(
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h
index 410110a1adf..c825a9a368d 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.h
+++ b/tensorflow/compiler/xla/service/interpreter/executable.h
@@ -43,12 +43,12 @@ class InterpreterExecutable : public Executable {
   InterpreterExecutable(std::unique_ptr<HloModule> hlo_module);
   ~InterpreterExecutable() override;

-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
+  StatusOr<ShapedBuffer> ExecuteOnStream(
      const ServiceExecutableRunOptions* run_options,
      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
      HloExecutionProfile* hlo_execution_profile) override;

-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
+  StatusOr<ShapedBuffer> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 2df59c35564..39f3aefdf80 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -550,7 +550,7 @@ Service::ExecuteParallelAndRegisterResult(
     // Stream executors for the replicas of the current computation.
     TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*backend, device_handles[i]));
     CHECK_EQ(replicas.size(), arguments[i].size());
-    std::vector<std::unique_ptr<ShapedBuffer>> result_buffers;
+    std::vector<ShapedBuffer> result_buffers;
     for (int64 replica = 0; replica < replicas.size(); ++replica) {
       TF_ASSIGN_OR_RETURN(Pool<se::Stream>::SmartPtr stream,
                           backend->BorrowStream(replicas[replica]));
@@ -582,7 +582,7 @@ Service::ExecuteParallelAndRegisterResult(
           backend->StreamBorrower());

       // Asynchronously launch the computation.
-      TF_ASSIGN_OR_RETURN(std::unique_ptr<ShapedBuffer> result,
+      TF_ASSIGN_OR_RETURN(ShapedBuffer result,
                           executables[i]->ExecuteAsyncOnStream(
                               &run_options, arguments[i][replica]));

@@ -1234,7 +1234,7 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
     streams.push_back(std::move(stream));
   }

-  std::vector<std::unique_ptr<ShapedBuffer>> result_buffers;
+  std::vector<ShapedBuffer> result_buffers;
   for (size_t i = 0; i < streams.size(); ++i) {
     const auto& stream = streams[i];
     ExecutableRunOptions options;
@@ -1247,7 +1247,7 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
     ServiceExecutableRunOptions service_options(
         options, execute_backend_->StreamBorrower());

-    TF_ASSIGN_OR_RETURN(std::unique_ptr<ShapedBuffer> this_result_buffer,
+    TF_ASSIGN_OR_RETURN(ShapedBuffer this_result_buffer,
                         executable->ExecuteAsyncOnStream(
                             &service_options, replicated_arguments[i]));

@@ -1347,16 +1347,16 @@ tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg,
   }

   // Allocate memory in each replica and transfer the data to all replicas.
-  std::vector<std::unique_ptr<ShapedBuffer>> replicated_buffers;
+  std::vector<ShapedBuffer> replicated_buffers;
   for (se::StreamExecutor* executor : replicas) {
     TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<ShapedBuffer> shaped_buffer,
+        ShapedBuffer shaped_buffer,
         execute_backend_->transfer_manager()->AllocateShapedBuffer(
             shape, execute_backend_->memory_allocator(),
             executor->device_ordinal()));
     TF_RETURN_IF_ERROR(
         execute_backend_->transfer_manager()->TransferLiteralToDevice(
-            executor, *literal, *shaped_buffer));
+            executor, *literal, shaped_buffer));
     replicated_buffers.emplace_back(std::move(shaped_buffer));
   }
   TF_ASSIGN_OR_RETURN(*result->mutable_data(),
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc
index 10a2aa2b30f..0b5a383f6fe 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer.cc
@@ -66,6 +66,8 @@ ShapedBuffer& ShapedBuffer::operator=(ShapedBuffer&& s) {
   return *this;
 }

+ShapedBuffer::~ShapedBuffer() {}
+
 void ShapedBuffer::clear() {
   for (auto& pair : buffers_) {
     // A default constructed DeviceMemoryBase is a null pointer.
@@ -102,18 +104,6 @@ std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer) {
   return out;
 }

-/* static */
-StatusOr<std::unique_ptr<ScopedShapedBuffer>> ScopedShapedBuffer::MakeScoped(
-    ShapedBuffer* shaped_buffer, DeviceMemoryAllocator* allocator) {
-  auto scoped_buffer = WrapUnique(new ScopedShapedBuffer(
-      shaped_buffer->on_host_shape(), shaped_buffer->on_device_shape(),
-      allocator, shaped_buffer->device_ordinal()));
-  scoped_buffer->buffers_ = shaped_buffer->buffers();
-  shaped_buffer->clear();
-
-  return std::move(scoped_buffer);
-}
-
 ScopedShapedBuffer::ScopedShapedBuffer(const Shape& on_host_shape,
                                        const Shape& on_device_shape,
                                        DeviceMemoryAllocator* allocator,
@@ -126,7 +116,25 @@ ScopedShapedBuffer::ScopedShapedBuffer(ShapedBuffer shaped_buffer,
                                        DeviceMemoryAllocator* allocator)
     : ShapedBuffer(std::move(shaped_buffer)), allocator_(allocator) {}

+ScopedShapedBuffer::ScopedShapedBuffer(ScopedShapedBuffer&& s)
+    : ShapedBuffer(std::move(s)), allocator_(s.allocator_) {
+  // Null out s.allocator_ so it doesn't try to free anything in its destructor.
+  s.allocator_ = nullptr;
+}
+
+ScopedShapedBuffer& ScopedShapedBuffer::operator=(ScopedShapedBuffer&& s) {
+  *static_cast<ShapedBuffer*>(this) = std::move(static_cast<ShapedBuffer&>(s));
+  allocator_ = s.allocator_;
+  // Null out s.allocator_ so it doesn't try to free anything in its destructor.
+  s.allocator_ = nullptr;
+  return *this;
+}
+
 ScopedShapedBuffer::~ScopedShapedBuffer() {
+  // allocator_ will be null if we were moved-from.
+  if (allocator_ == nullptr) {
+    return;
+  }
   // Deallocate all non-null buffers. A buffer may appear in more than one spot
   // in the shape (eg, a tuple with a repeated element) so keep track of what
   // has been deallocated.
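The move constructor and move assignment introduced above both null out the source's allocator_ so that a moved-from ScopedShapedBuffer destructs as a no-op. That is a standard idiom for movable RAII owners; here it is in isolation, on a hypothetical OwnedBlock type (names are illustrative, not XLA's):

#include <cstdlib>
#include <utility>

class OwnedBlock {
 public:
  explicit OwnedBlock(std::size_t n) : data_(std::malloc(n)) {}
  OwnedBlock(OwnedBlock&& other) noexcept : data_(other.data_) {
    other.data_ = nullptr;  // The moved-from destructor must not free.
  }
  OwnedBlock& operator=(OwnedBlock&& other) noexcept {
    if (this != &other) {
      std::free(data_);
      data_ = other.data_;
      other.data_ = nullptr;  // Same guard as ScopedShapedBuffer above.
    }
    return *this;
  }
  OwnedBlock(const OwnedBlock&) = delete;
  OwnedBlock& operator=(const OwnedBlock&) = delete;
  ~OwnedBlock() {
    if (data_ == nullptr) return;  // Null means ownership was moved away.
    std::free(data_);
  }

 private:
  void* data_;
};

int main() {
  OwnedBlock a(64);
  OwnedBlock b(std::move(a));  // b owns the block; a's destructor is a no-op.
  return 0;
}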
@@ -142,8 +150,8 @@ ScopedShapedBuffer::~ScopedShapedBuffer() {
   }
 }

-std::unique_ptr<ShapedBuffer> ScopedShapedBuffer::release() {
-  auto shaped_buffer = MakeUnique<ShapedBuffer>(std::move(*this));
+ShapedBuffer ScopedShapedBuffer::release() {
+  ShapedBuffer shaped_buffer(std::move(*this));
   buffers_ = ShapeTree<se::DeviceMemoryBase>();
   return shaped_buffer;
 }
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h
index 62ba8f27342..f1b0527474c 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.h
+++ b/tensorflow/compiler/xla/service/shaped_buffer.h
@@ -43,6 +43,14 @@ class ShapedBuffer {
   ShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape,
                const se::Platform* platform, int device_ordinal);

+  // Movable, but not copyable.
+  ShapedBuffer(ShapedBuffer&& s);
+  ShapedBuffer& operator=(ShapedBuffer&&);
+  ShapedBuffer(const ShapedBuffer&) = delete;
+  ShapedBuffer& operator=(const ShapedBuffer&) = delete;
+
+  virtual ~ShapedBuffer();
+
   // Returns the shape of the on-host representation of the data held by this
   // ShapedBuffer.
   const Shape& on_host_shape() const { return on_host_shape_; }
@@ -80,13 +88,7 @@ class ShapedBuffer {
   string ToString() const;

-  ShapedBuffer(ShapedBuffer&& s);
-  ShapedBuffer& operator=(ShapedBuffer&&);
-
  protected:
-  ShapedBuffer(const ShapedBuffer&) = delete;
-  ShapedBuffer& operator=(const ShapedBuffer&) = delete;
-
   // The shape of the data when represented on the host.
   Shape on_host_shape_;

@@ -108,41 +110,45 @@ std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer);
 // ShapedBuffer derived class which allocates all internal buffers on
 // construction and deallocates the memory when the object is
 // destructed.
+//
+// TODO(timshen): Remove inheritance between ScopedShapedBuffer and
+// ShapedBuffer. There should never be a need to consider a ScopedShapedBuffer
+// as a ShapedBuffer, because in that case we should just be able to pass around
+// our ShapeTree. Inheritance only adds complexity. See
+// discussion in cl/192849370.
 class ScopedShapedBuffer : public ShapedBuffer {
  public:
-  // Takes a ShapedBuffer and returns a ScopedShapedBuffer which manages the
-  // deallocation of the device memory held in the shaped buffer. All device
-  // memory pointers in the given ShapedBuffer are set to null.
-  static StatusOr<std::unique_ptr<ScopedShapedBuffer>> MakeScoped(
-      ShapedBuffer* shaped_buffer, DeviceMemoryAllocator* allocator);
-
-  // Create a ScopedShapedBuffer with null DeviceMemoryBases at each index.
-  ScopedShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape,
-                     DeviceMemoryAllocator* allocator, int device_ordinal);
+  // Creates a ScopedShapedBuffer with null DeviceMemoryBases at each index.
+  explicit ScopedShapedBuffer(const Shape& on_host_shape,
+                              const Shape& on_device_shape,
+                              DeviceMemoryAllocator* allocator,
+                              int device_ordinal);

   // Create a ScopedShapedBuffer by taking over the memory from the incoming
   // ShapedBuffer.
-  ScopedShapedBuffer(ShapedBuffer shaped_buffer,
-                     DeviceMemoryAllocator* allocator);
+  explicit ScopedShapedBuffer(ShapedBuffer shaped_buffer,
+                              DeviceMemoryAllocator* allocator);
+
+  // Movable, but not copyable.
+  ScopedShapedBuffer(ScopedShapedBuffer&& s);
+  ScopedShapedBuffer& operator=(ScopedShapedBuffer&&);
+  ScopedShapedBuffer(const ScopedShapedBuffer&) = delete;
+  ScopedShapedBuffer& operator=(const ScopedShapedBuffer&) = delete;
+
+  // All buffers in the shape are deallocated on destruction.
+  ~ScopedShapedBuffer() override;

   // Return the allocator used to allocate the device memory held in this
   // ScopedShapedBuffer.
   DeviceMemoryAllocator* memory_allocator() const { return allocator_; }

-  // Release all device memory owned by this ScopedShapedBuffer and
-  // return the device memory pointers in the form of a
-  // ShapedBuffer. The returned ShapedBuffer takes over the memory
-  // from the ScopedShapedBuffer. The resulting ScopedShapedBuffer can
-  // only be destroyed.
-  std::unique_ptr<ShapedBuffer> release();
-
-  // All buffers in the shape are deallocated on destruction.
-  virtual ~ScopedShapedBuffer();
+  // Releases all device memory owned by this ScopedShapedBuffer and returns the
+  // device memory pointers in the form of a ShapedBuffer. The returned
+  // ShapedBuffer takes over the memory from the ScopedShapedBuffer. The
+  // resulting ScopedShapedBuffer can only be destroyed.
+  ShapedBuffer release();

  protected:
-  ScopedShapedBuffer(const ScopedShapedBuffer&) = delete;
-  void operator=(const ScopedShapedBuffer&) = delete;
-
   DeviceMemoryAllocator* allocator_;
 };
diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc
index be8231b73c0..98d0111d04d 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/transfer_manager.cc
@@ -175,7 +175,7 @@ Status TransferManager::TransferBufferToDevice(
   return Status::OK();
 }

-StatusOr<std::unique_ptr<ShapedBuffer>> TransferManager::AllocateShapedBuffer(
+StatusOr<ShapedBuffer> TransferManager::AllocateShapedBuffer(
     const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
     int device_ordinal) {
   if (!LayoutUtil::HasLayout(on_host_shape)) {
@@ -187,31 +187,30 @@ StatusOr<std::unique_ptr<ShapedBuffer>> TransferManager::AllocateShapedBuffer(
   const Shape on_device_shape = HostShapeToDeviceShape(on_host_shape);
   TF_RET_CHECK(LayoutUtil::HasLayout(on_device_shape));

-  auto shaped_buffer = WrapUnique(new ShapedBuffer(
-      on_host_shape, on_device_shape, allocator->platform(), device_ordinal));
+  ShapedBuffer shaped_buffer(on_host_shape, on_device_shape,
+                             allocator->platform(), device_ordinal);

   // Allocate an appropriate sized buffer for each element in the shape
   // including the tuple pointer arrays.
-  for (auto& pair : shaped_buffer->buffers()) {
+  for (auto& pair : shaped_buffer.buffers()) {
     const ShapeIndex& index = pair.first;
     se::DeviceMemoryBase& memory_base = pair.second;
     const Shape& subshape = ShapeUtil::GetSubshape(on_device_shape, index);
     TF_ASSIGN_OR_RETURN(memory_base,
-                        allocator->Allocate(shaped_buffer->device_ordinal(),
+                        allocator->Allocate(shaped_buffer.device_ordinal(),
                                             GetByteSizeRequirement(subshape)));
   }

   return std::move(shaped_buffer);
 }

-StatusOr<std::unique_ptr<ScopedShapedBuffer>>
-TransferManager::AllocateScopedShapedBuffer(const Shape& on_host_shape,
-                                            DeviceMemoryAllocator* allocator,
-                                            int device_ordinal) {
+StatusOr<ScopedShapedBuffer> TransferManager::AllocateScopedShapedBuffer(
+    const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
+    int device_ordinal) {
   TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<ShapedBuffer> unscoped_buffer,
+      ShapedBuffer unscoped_buffer,
       AllocateShapedBuffer(on_host_shape, allocator, device_ordinal));
-  return ScopedShapedBuffer::MakeScoped(unscoped_buffer.get(), allocator);
+  return ScopedShapedBuffer(std::move(unscoped_buffer), allocator);
 }

 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h
index 410d2af7af6..a6451c4bb11 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@@ -107,10 +107,10 @@ class TransferManager {
   // Allocate a ShapedBuffer which can hold data with the given on-host
   // shape. The on-device shape may be different as indicated by
   // HostShapeToDeviceShape.
-  StatusOr<std::unique_ptr<ShapedBuffer>> AllocateShapedBuffer(
-      const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
-      int device_ordinal);
-  StatusOr<std::unique_ptr<ScopedShapedBuffer>> AllocateScopedShapedBuffer(
+  StatusOr<ShapedBuffer> AllocateShapedBuffer(const Shape& on_host_shape,
+                                              DeviceMemoryAllocator* allocator,
+                                              int device_ordinal);
+  StatusOr<ScopedShapedBuffer> AllocateScopedShapedBuffer(
       const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
       int device_ordinal);
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index 464b8cbebb1..021fbcedb99 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -735,11 +735,11 @@ void BM_DynamicSlice(int num_iters) {
   auto start_indices_literal = Literal::CreateR1<int32>({0, 1, 2, 3});
   ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice(
-      executors[device_ordinal], *start_indices_literal, *buffer));
+      executors[device_ordinal], *start_indices_literal, buffer));

   std::unique_ptr<LocalExecutable> executable =
       client
-          ->Compile(computation, {&buffer->on_host_shape()},
+          ->Compile(computation, {&buffer.on_host_shape()},
                     ExecutableBuildOptions())
           .ConsumeValueOrDie();

@@ -748,14 +748,14 @@ void BM_DynamicSlice(int num_iters) {
   options.set_allocator(&allocator);
   const int kWarmups = 2;
   for (int i = 0; i < kWarmups; ++i) {
-    auto result = executable->Run({buffer.get()}, options);
+    auto result = executable->Run({&buffer}, options);
     ASSERT_TRUE(result.ok());
   }

   // Run benchmark.
   tensorflow::testing::StartTiming();
   for (int i = 0; i < num_iters; ++i) {
-    auto result = executable->Run({buffer.get()}, options);
+    auto result = executable->Run({&buffer}, options);
     ASSERT_TRUE(result.ok());
   }
 }
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index ed16963b40b..c7f64d85609 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -794,19 +794,19 @@ void BM_ParallelFusion(int num_iters) {
   // Transfer literals to device.
   auto param0_literal =
       Literal::CreateR2F32Linspace(1.0, 2.0, param0_dim0, param0_dim1);
-  std::unique_ptr<ScopedShapedBuffer> buffer0 =
+  ShapedBuffer buffer0 =
       client->LiteralToShapedBuffer(*param0_literal, device_ordinal)
           .ConsumeValueOrDie();

   auto param1_literal =
       Literal::CreateR2F32Linspace(1.0, 2.0, param1_dim0, param1_dim1);
-  std::unique_ptr<ScopedShapedBuffer> buffer1 =
+  ShapedBuffer buffer1 =
       client->LiteralToShapedBuffer(*param1_literal, device_ordinal)
           .ConsumeValueOrDie();

   auto param2_literal =
       Literal::CreateR2F32Linspace(1.0, 2.0, param2_dim0, param2_dim1);
-  std::unique_ptr<ScopedShapedBuffer> buffer2 =
+  ShapedBuffer buffer2 =
       client->LiteralToShapedBuffer(*param2_literal, device_ordinal)
           .ConsumeValueOrDie();

@@ -814,8 +814,8 @@ void BM_ParallelFusion(int num_iters) {
   std::unique_ptr<LocalExecutable> executable =
       client
           ->Compile(computation,
-                    {&buffer0->on_host_shape(), &buffer1->on_host_shape(),
-                     &buffer2->on_host_shape()},
+                    {&buffer0.on_host_shape(), &buffer1.on_host_shape(),
+                     &buffer2.on_host_shape()},
                     ExecutableBuildOptions())
           .ConsumeValueOrDie();

@@ -836,8 +836,7 @@ void BM_ParallelFusion(int num_iters) {
   // Run some warm-up executions.
   const int kWarmups = 2;
   for (int i = 0; i < kWarmups; ++i) {
-    auto result =
-        executable->Run({buffer0.get(), buffer1.get(), buffer2.get()}, options);
+    auto result = executable->Run({&buffer0, &buffer1, &buffer2}, options);
     ASSERT_TRUE(result.ok());
   }

@@ -850,8 +849,7 @@ void BM_ParallelFusion(int num_iters) {
   tensorflow::testing::UseRealTime();
   tensorflow::testing::StartTiming();
   for (int i = 0; i < num_iters; ++i) {
-    auto result =
-        executable->Run({buffer0.get(), buffer1.get(), buffer2.get()}, options);
+    auto result = executable->Run({&buffer0, &buffer1, &buffer2}, options);
     ASSERT_TRUE(result.ok());
   }
 }
diff --git a/tensorflow/compiler/xla/tests/local_client_allocation_test.cc b/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
index 3d30ceeaf1b..7209f91639b 100644
--- a/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/local_client_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"

@@ -53,7 +54,7 @@ XLA_TEST_F(LocalClientAllocationTest, AddVectors) {
   // deallocation happen on the right allocator.
   ExecutableRunOptions options;
   options.set_allocator(allocator);
-  std::unique_ptr<ScopedShapedBuffer> result =
+  tensorflow::gtl::optional<ScopedShapedBuffer> result =
       ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {},
                           DefaultExecutableBuildOptions(), options);

@@ -66,7 +67,7 @@ XLA_TEST_F(LocalClientAllocationTest, AddVectors) {
   // Deallocate result and verify that deallocate was called once.
   int64 deallocation_count_before = allocator_->deallocation_count();
-  result = nullptr;
+  result.reset();
   EXPECT_EQ(deallocation_count_before + 1, allocator_->deallocation_count());
 }

@@ -92,7 +93,7 @@ XLA_TEST_F(LocalClientAllocationTest, RunOnDevices) {
         computation, {}, ExecutableBuildOptions().set_device_ordinal(d),
         ExecutableRunOptions().set_device_ordinal(d).set_allocator(allocator));
     LiteralTestUtil::ExpectR1Near<float>(
-        {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(*result), error_spec_);
+        {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_);

     // At least one allocation should have been performed when executing the
     // computation.
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index 373dd3c5df4..7e14e77366d 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -57,10 +57,9 @@ XLA_TEST_F(LocalClientExecuteTest, Constant) {
   ComputationBuilder builder(local_client_, TestName());
   auto y = builder.ConstantR0<float>(123.0f);

-  std::unique_ptr<ScopedShapedBuffer> result =
+  ScopedShapedBuffer result =
       ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {});
-
-  LiteralTestUtil::ExpectR0Near<float>(123.f, *ShapedBufferToLiteral(*result),
+  LiteralTestUtil::ExpectR0Near<float>(123.f, *ShapedBufferToLiteral(result),
                                        error_spec_);
 }

@@ -71,10 +70,9 @@ XLA_TEST_F(LocalClientExecuteTest, AddScalars) {
   builder.Add(x, y);

   auto x_value = LiteralToShapedBuffer(*Literal::CreateR0<float>(42.0f));
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {x_value.get()});
-
-  LiteralTestUtil::ExpectR0Near<float>(165.f, *ShapedBufferToLiteral(*result),
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&x_value});
+  LiteralTestUtil::ExpectR0Near<float>(165.f, *ShapedBufferToLiteral(result),
                                        error_spec_);
 }

@@ -85,10 +83,9 @@ XLA_TEST_F(LocalClientExecuteTest, AddZeroElementVectors) {
   builder.Add(x, y);

   auto x_array = LiteralToShapedBuffer(*Literal::CreateR1<float>({}));
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {x_array.get()});
-
-  LiteralTestUtil::ExpectR1Near<float>({}, *ShapedBufferToLiteral(*result),
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&x_array});
+  LiteralTestUtil::ExpectR1Near<float>({}, *ShapedBufferToLiteral(result),
                                        error_spec_);
 }

@@ -100,11 +97,10 @@ XLA_TEST_F(LocalClientExecuteTest, AddVectors) {
   auto x_array =
       LiteralToShapedBuffer(*Literal::CreateR1<float>({0.0f, 1.0f, 2.0f}));
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {x_array.get()});
-
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&x_array});
   LiteralTestUtil::ExpectR1Near<float>(
-      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(*result), error_spec_);
+      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_);
 }

 XLA_TEST_F(LocalClientExecuteTest, AddVectorsWithProfile) {
@@ -116,13 +112,12 @@ XLA_TEST_F(LocalClientExecuteTest, AddVectorsWithProfile) {
   auto x_array =
       LiteralToShapedBuffer(*Literal::CreateR1<float>({0.0f, 1.0f, 2.0f}));
   ExecutionProfile profile;
-  std::unique_ptr<ScopedShapedBuffer> result = ExecuteLocallyOrDie(
-      builder.Build().ValueOrDie(), {x_array.get()},
-      DefaultExecutableBuildOptions(),
+  ScopedShapedBuffer result = ExecuteLocallyOrDie(
+      builder.Build().ValueOrDie(), {&x_array}, DefaultExecutableBuildOptions(),
       DefaultExecutableRunOptions().set_execution_profile(&profile));

   LiteralTestUtil::ExpectR1Near<float>(
-      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(*result), error_spec_);
+      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_);
   EXPECT_GT(profile.compute_and_transfer_time_ns(), 0);
 }

@@ -136,27 +131,27 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentInputLayouts) {
   // Create x as a col-major array.
   auto x_array = LiteralToShapedBuffer(*Literal::CreateR2WithLayout<float>(
       {{1.0f, 2.0f}, {3.0f, 4.0f}}, LayoutUtil::MakeLayout({0, 1})));
-  EXPECT_TRUE(LayoutUtil::Equal(x_array->on_device_shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(x_array.on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({0, 1})));

   // Create y as a row-major array.
   auto y_array = LiteralToShapedBuffer(*Literal::CreateR2WithLayout<float>(
       {{10.0f, 20.0f}, {30.0f, 40.0f}}, LayoutUtil::MakeLayout({1, 0})));
-  EXPECT_TRUE(LayoutUtil::Equal(y_array->on_device_shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(y_array.on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({1, 0})));

-  std::unique_ptr<ScopedShapedBuffer> result_colmaj =
-      ExecuteLocallyOrDie(computation, {x_array.get(), y_array.get()});
+  ScopedShapedBuffer result_colmaj =
+      ExecuteLocallyOrDie(computation, {&x_array, &y_array});
   LiteralTestUtil::ExpectR2Near<float>({{11.0f, 22.0f}, {33.0f, 44.0f}},
-                                       *ShapedBufferToLiteral(*result_colmaj),
+                                       *ShapedBufferToLiteral(result_colmaj),
                                        error_spec_);

   // Run with the parameter values in a different order.
-  std::unique_ptr<ScopedShapedBuffer> result_param_swap =
-      ExecuteLocallyOrDie(computation, {y_array.get(), x_array.get()});
+  ScopedShapedBuffer result_param_swap =
+      ExecuteLocallyOrDie(computation, {&y_array, &x_array});
   LiteralTestUtil::ExpectR2Near<float>(
       {{11.0f, 22.0f}, {33.0f, 44.0f}},
-      *ShapedBufferToLiteral(*result_param_swap), error_spec_);
+      *ShapedBufferToLiteral(result_param_swap), error_spec_);
 }

@@ -172,27 +167,27 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentOutputLayouts) {
   auto y_array = LiteralToShapedBuffer(
       *Literal::CreateR2<float>({{10.0f, 20.0f}, {30.0f, 40.0f}}));

   // Run with col-major result layout.
-  std::unique_ptr<ScopedShapedBuffer> result_colmaj = ExecuteLocallyOrDie(
-      computation, {x_array.get(), y_array.get()},
+  ScopedShapedBuffer result_colmaj = ExecuteLocallyOrDie(
+      computation, {&x_array, &y_array},
       DefaultExecutableBuildOptions().set_result_layout(
           ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{2, 2}, {0, 1})),
       DefaultExecutableRunOptions());
-  EXPECT_TRUE(LayoutUtil::Equal(result_colmaj->on_device_shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(result_colmaj.on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({0, 1})));
   LiteralTestUtil::ExpectR2Near<float>({{11.0f, 22.0f}, {33.0f, 44.0f}},
-                                       *ShapedBufferToLiteral(*result_colmaj),
+                                       *ShapedBufferToLiteral(result_colmaj),
                                        error_spec_);

   // Run with row-major result layout.
-  std::unique_ptr<ScopedShapedBuffer> result_rowmaj = ExecuteLocallyOrDie(
-      computation, {x_array.get(), y_array.get()},
+  ScopedShapedBuffer result_rowmaj = ExecuteLocallyOrDie(
+      computation, {&x_array, &y_array},
       DefaultExecutableBuildOptions().set_result_layout(
           ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{2, 2}, {1, 0})),
       DefaultExecutableRunOptions());
-  EXPECT_TRUE(LayoutUtil::Equal(result_rowmaj->on_device_shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(result_rowmaj.on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({1, 0})));
   LiteralTestUtil::ExpectR2Near<float>({{11.0f, 22.0f}, {33.0f, 44.0f}},
-                                       *ShapedBufferToLiteral(*result_rowmaj),
+                                       *ShapedBufferToLiteral(result_rowmaj),
                                        error_spec_);
 }

@@ -208,13 +203,13 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResult) {
   auto y_array = LiteralToShapedBuffer(
       *Literal::CreateR2<float>({{10.0f, 20.0f}, {30.0f, 40.0f}}));

-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {x_array.get(), y_array.get()});
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(computation, {&x_array, &y_array});

-  EXPECT_TRUE(ShapeUtil::IsTuple(result->on_host_shape()));
-  EXPECT_EQ(3, ShapeUtil::TupleElementCount(result->on_host_shape()));
+  EXPECT_TRUE(ShapeUtil::IsTuple(result.on_host_shape()));
+  EXPECT_EQ(3, ShapeUtil::TupleElementCount(result.on_host_shape()));

-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {0}));
   LiteralTestUtil::ExpectR2Equal<float>(
@@ -237,13 +232,13 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) {
   auto y_array = LiteralToShapedBuffer(
       *Literal::CreateR2<float>({{10.0f, 20.0f}, {30.0f, 40.0f}}));

-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {x_array.get(), y_array.get()});
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(computation, {&x_array, &y_array});

-  EXPECT_TRUE(ShapeUtil::IsTuple(result->on_host_shape()));
-  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result->on_host_shape()));
+  EXPECT_TRUE(ShapeUtil::IsTuple(result.on_host_shape()));
+  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result.on_host_shape()));

-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {1}));
   LiteralTestUtil::ExpectR2Equal<float>(
@@ -274,11 +269,11 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResultWithLayout) {
       ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{2, 2},
                                      /*minor_to_major=*/{1, 0})});
   options.set_result_layout(shape_with_layout);
-  std::unique_ptr<ScopedShapedBuffer> result = ExecuteLocallyOrDie(
-      builder.Build().ValueOrDie(), {array.get(), array.get()}, options,
-      DefaultExecutableRunOptions());
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&array, &array},
+                          options, DefaultExecutableRunOptions());

-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {0}));
   LiteralTestUtil::ExpectR2Equal<float>(
@@ -318,13 +313,13 @@ XLA_TEST_F(LocalClientExecuteTest, TupleArguments) {
   auto x_buffer = LiteralToShapedBuffer(*x_literal);
   auto y_buffer = LiteralToShapedBuffer(*y_literal);

-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {x_buffer.get(), y_buffer.get()});
+  ScopedShapedBuffer result =
+      ExecuteLocallyOrDie(computation, {&x_buffer, &y_buffer});

-  EXPECT_TRUE(ShapeUtil::IsTuple(result->on_host_shape()));
-  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result->on_host_shape()));
+  EXPECT_TRUE(ShapeUtil::IsTuple(result.on_host_shape()));
+  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result.on_host_shape()));

-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{56.0f, 46.0f}, {36.0f, 26.0f}},
       LiteralView::Create(*result_literal, {0}));
@@ -363,10 +358,9 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleArgument) {
                     Literal::CreateR1<float>({222.0, -2.0, 10.0}).get()});
   auto arg_buffer = LiteralToShapedBuffer(*arg_literal);

-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {arg_buffer.get()});
+  ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&arg_buffer});

-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{-1.0, -2.0}, {-3.0, -4}}, LiteralView::Create(*result_literal, {0}));
   LiteralTestUtil::ExpectR1Equal<float>(
@@ -394,18 +388,16 @@ XLA_TEST_F(LocalClientExecuteTest, PassingTupleResultBackIntoComputation) {
                     Literal::CreateR2<float>({{11.0, 3.0}, {4.0, 5.0}}).get()});
   auto arg_buffer = LiteralToShapedBuffer(*arg_literal);

-  std::unique_ptr<ScopedShapedBuffer> result_0 =
-      ExecuteLocallyOrDie(computation, {arg_buffer.get()});
-  std::unique_ptr<Literal> result_0_literal = ShapedBufferToLiteral(*result_0);
+  ScopedShapedBuffer result_0 = ExecuteLocallyOrDie(computation, {&arg_buffer});
+  std::unique_ptr<Literal> result_0_literal = ShapedBufferToLiteral(result_0);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{-1.0, -2.0}, {-3.0, -4.0}},
       LiteralView::Create(*result_0_literal, {0}));
   LiteralTestUtil::ExpectR2Equal<float>(
       {{22.0, 6.0}, {8.0, 10}}, LiteralView::Create(*result_0_literal, {1}));

-  std::unique_ptr<ScopedShapedBuffer> result_1 =
-      ExecuteLocallyOrDie(computation, {result_0.get()});
-  std::unique_ptr<Literal> result_1_literal = ShapedBufferToLiteral(*result_1);
+  ScopedShapedBuffer result_1 = ExecuteLocallyOrDie(computation, {&result_0});
+  std::unique_ptr<Literal> result_1_literal = ShapedBufferToLiteral(result_1);
   LiteralTestUtil::ExpectR2Equal<float>(
       {{1.0, 2.0}, {3.0, 4.0}}, LiteralView::Create(*result_1_literal, {0}));
   LiteralTestUtil::ExpectR2Equal<float>(
@@ -451,10 +443,8 @@ XLA_TEST_F(LocalClientExecuteTest, LargeTuple) {
       Literal::MakeTupleOwned(std::move(arg_elements));
   auto arg_buffer = LiteralToShapedBuffer(*arg_literal);

-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {arg_buffer.get()});
-
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&arg_buffer});
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);

   for (int i = 0; i < kElementCount; ++i) {
     LiteralTestUtil::ExpectR1Near<float>(
@@ -509,9 +499,8 @@ XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_CPU_PARALLEL(LargeNestedTuple)) {
   auto arg_literal = Literal::MakeTupleOwned(std::move(outer_tuple_elements));
   auto arg_buffer = LiteralToShapedBuffer(*arg_literal);

-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {arg_buffer.get()});
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&arg_buffer});
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);

   for (int i = 0; i < kFanout; ++i) {
     for (int j = 0; j < kFanout; ++j) {
@@ -554,9 +543,8 @@ XLA_TEST_F(LocalClientExecuteTest, DeepTuple) {
   }
   auto arg_buffer = LiteralToShapedBuffer(*arg_literal);
-  std::unique_ptr<ScopedShapedBuffer> result =
-      ExecuteLocallyOrDie(computation, {arg_buffer.get()});
-  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
+  ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&arg_buffer});
+  std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(result);

   ShapeIndex index;
   for (int i = 0; i < kTupleDepth; ++i) {
@@ -576,7 +564,7 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidNumberOfArguments) {
   auto x_array =
       LiteralToShapedBuffer(*Literal::CreateR1<float>({1.0f, 2.0f, 3.0f}));
   auto execute_status =
-      ExecuteLocally(builder.Build().ValueOrDie(), {x_array.get()});
+      ExecuteLocally(builder.Build().ValueOrDie(), {&x_array});

   EXPECT_FALSE(execute_status.ok());
   EXPECT_THAT(execute_status.status().error_message(),
@@ -592,7 +580,7 @@ XLA_TEST_F(LocalClientExecuteTest, IncorrectArgumentShape) {
   auto x_array = LiteralToShapedBuffer(
       *Literal::CreateR2<float>({{0.0f, 1.0f}, {2.0f, 3.0f}}));
   auto execute_status =
-      ExecuteLocally(builder.Build().ValueOrDie(), {x_array.get()});
+      ExecuteLocally(builder.Build().ValueOrDie(), {&x_array});

   EXPECT_FALSE(execute_status.ok());
   EXPECT_THAT(execute_status.status().error_message(),
@@ -609,7 +597,7 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidResultLayout) {
   auto x_array = LiteralToShapedBuffer(
       *Literal::CreateR2<float>({{0.0f, 1.0f}, {2.0f, 3.0f}}));
   auto execute_status = ExecuteLocally(
-      builder.Build().ValueOrDie(), {x_array.get()},
+      builder.Build().ValueOrDie(), {&x_array},
       DefaultExecutableBuildOptions().set_result_layout(
           ShapeUtil::MakeShapeWithLayout(F32,
                                          /*dimensions=*/{1, 2, 3, 4},
@@ -642,9 +630,9 @@ XLA_TEST_F(LocalClientExecuteTest, RunOnAllDeviceOrdinals) {
         computation, {}, DefaultExecutableBuildOptions().set_device_ordinal(d),
         DefaultExecutableRunOptions().set_device_ordinal(d));
-    EXPECT_EQ(d, result->device_ordinal());
+    EXPECT_EQ(d, result.device_ordinal());
     LiteralTestUtil::ExpectR0Equal<float>(42.0f,
-                                          *ShapedBufferToLiteral(*result));
+                                          *ShapedBufferToLiteral(result));
   }
 }
 }
@@ -687,9 +675,9 @@ XLA_TEST_F(LocalClientExecuteTest, RunOnStream) {
         DefaultExecutableRunOptions().set_stream(&stream));
     // As a check to verify that the computation ran of the device associated
     // with the stream. This is a weak check, but stronger verification is hard.
-    EXPECT_EQ(d, result->device_ordinal());
+    EXPECT_EQ(d, result.device_ordinal());
     LiteralTestUtil::ExpectR0Equal<float>(42.0f,
-                                          *ShapedBufferToLiteral(*result));
+                                          *ShapedBufferToLiteral(result));
   }
 }

@@ -765,9 +753,9 @@ XLA_TEST_F(LocalClientExecuteTest, SelectBetweenTuples) {
       {builder.ConstantR1<float>(vec2), builder.ConstantR1<float>(vec1)});
   builder.Select(builder.ConstantR0<bool>(false), tuple12, tuple21);

-  std::unique_ptr<ScopedShapedBuffer> result =
+  ScopedShapedBuffer result =
       ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {});
-  std::unique_ptr<Literal> tuple_literal = ShapedBufferToLiteral(*result);
+  std::unique_ptr<Literal> tuple_literal = ShapedBufferToLiteral(result);
   LiteralTestUtil::ExpectR1Equal<float>(
       {2.0f, 4.0f, 6.0f}, LiteralView::Create(*tuple_literal, {0}));
   LiteralTestUtil::ExpectR1Equal<float>(
@@ -791,12 +779,12 @@ XLA_TEST_F(LocalClientExecuteTest, CompileExecutable) {
   auto x_array =
       LiteralToShapedBuffer(*Literal::CreateR1<float>({0.0f, 1.0f, 2.0f}));
-  std::unique_ptr<ScopedShapedBuffer> result =
-      executable->Run({x_array.get()}, DefaultExecutableRunOptions())
+  ScopedShapedBuffer result =
+      executable->Run({&x_array}, DefaultExecutableRunOptions())
           .ConsumeValueOrDie();

   LiteralTestUtil::ExpectR1Near<float>(
-      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(*result), error_spec_);
+      {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_);
 }

 XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion) {
@@ -809,7 +797,7 @@ XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion) {
                     literal, local_client_->default_device_ordinal(), allocator_));
     TF_ASSERT_OK_AND_ASSIGN(
         auto transferred_literal,
-        local_client_->ShapedBufferToLiteral(*shaped_buffer));
+        local_client_->ShapedBufferToLiteral(shaped_buffer));
     EXPECT_EQ(literal, *transferred_literal);
   };

@@ -849,7 +837,7 @@ XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion64bit) {
                     literal, local_client_->default_device_ordinal(), allocator_));
     TF_ASSERT_OK_AND_ASSIGN(
         auto transferred_literal,
-        local_client_->ShapedBufferToLiteral(*shaped_buffer));
+        local_client_->ShapedBufferToLiteral(shaped_buffer));
     EXPECT_EQ(literal, *transferred_literal);
   };

@@ -917,12 +905,12 @@ void BM_LocalClientOverhead(int num_iters) {
           .ConsumeValueOrDie();
   auto literal = Literal::CreateR2<int32>({{0, 0, 0}, {0, 0, 0}});
   ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice(
-      executors[device_ordinal], *literal, *buffer));
+      executors[device_ordinal], *literal, buffer));

   const int kWarmups = 2;

   auto executable_status = client->Compile(
-      computation, {&buffer->on_host_shape()}, ExecutableBuildOptions());
+      computation, {&buffer.on_host_shape()}, ExecutableBuildOptions());
   ASSERT_IS_OK(executable_status);
   std::unique_ptr<LocalExecutable> executable =
       executable_status.ConsumeValueOrDie();
@@ -934,13 +922,13 @@ void BM_LocalClientOverhead(int num_iters) {
   run_options.set_allocator(&allocator).set_stream(&stream);

   for (int i = 0; i < kWarmups; ++i) {
-    auto result = executable->Run({buffer.get()}, run_options);
+    auto result = executable->Run({&buffer}, run_options);
     ASSERT_IS_OK(result);
   }

   tensorflow::testing::StartTiming();
   for (int i = 0; i < num_iters; ++i) {
-    auto result = executable->Run({buffer.get()}, run_options);
+    auto result = executable->Run({&buffer}, run_options);
     ASSERT_IS_OK(result);
   }
 }
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index 29fd985acfc..c60ba2422f4 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -128,7 +128,7 @@
 LocalClientTestBase::LocalClientTestBase(se::Platform* platform)

 LocalClientTestBase::~LocalClientTestBase() {}

-std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::LiteralToShapedBuffer(
+ScopedShapedBuffer LocalClientTestBase::LiteralToShapedBuffer(
     const Literal& literal) {
   return local_client_
       ->LiteralToShapedBuffer(literal, local_client_->default_device_ordinal())
@@ -155,7 +155,7 @@ ExecutableRunOptions LocalClientTestBase::DefaultExecutableRunOptions() const {
   return run_options;
 }

-std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocallyOrDie(
+ScopedShapedBuffer LocalClientTestBase::ExecuteLocallyOrDie(
     const Computation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   return ExecuteLocally(computation, arguments, DefaultExecutableBuildOptions(),
@@ -163,7 +163,7 @@ std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocallyOrDie(
       .ConsumeValueOrDie();
 }

-std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocallyOrDie(
+ScopedShapedBuffer LocalClientTestBase::ExecuteLocallyOrDie(
     const Computation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     const ExecutableBuildOptions& build_options,
@@ -172,16 +172,14 @@ std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocallyOrDie(
       .ConsumeValueOrDie();
 }

-StatusOr<std::unique_ptr<ScopedShapedBuffer>>
-LocalClientTestBase::ExecuteLocally(
+StatusOr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocally(
     const Computation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   return ExecuteLocally(computation, arguments, DefaultExecutableBuildOptions(),
                         DefaultExecutableRunOptions());
 }

-StatusOr<std::unique_ptr<ScopedShapedBuffer>>
-LocalClientTestBase::ExecuteLocally(
+StatusOr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocally(
     const Computation& computation,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     const ExecutableBuildOptions& build_options,
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h
index 7555d5e8938..4ee56a05ec6 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.h
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.h
@@ -83,8 +83,7 @@ class LocalClientTestBase : public ::testing::Test {
   // Copy the given literal onto the default device and return a
   // ScopedShapedBuffer. Convenience wrapper around
   // LocalClient::LiteralToShapedBuffer.
-  std::unique_ptr<ScopedShapedBuffer> LiteralToShapedBuffer(
-      const Literal& literal);
+  ScopedShapedBuffer LiteralToShapedBuffer(const Literal& literal);

   // Construct and return a literal containing the array represented by
   // shaped_buffer.
@@ -93,19 +92,19 @@ class LocalClientTestBase : public ::testing::Test {
   // Execute the given computation on the local client. With and without
   // options.
-  StatusOr<std::unique_ptr<ScopedShapedBuffer>> ExecuteLocally(
+  StatusOr<ScopedShapedBuffer> ExecuteLocally(
      const Computation& computation,
      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments);
-  StatusOr<std::unique_ptr<ScopedShapedBuffer>> ExecuteLocally(
+  StatusOr<ScopedShapedBuffer> ExecuteLocally(
      const Computation& computation,
      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
      const ExecutableBuildOptions& build_options,
      const ExecutableRunOptions& run_options);

-  std::unique_ptr<ScopedShapedBuffer> ExecuteLocallyOrDie(
+  ScopedShapedBuffer ExecuteLocallyOrDie(
      const Computation& computation,
      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments);
-  std::unique_ptr<ScopedShapedBuffer> ExecuteLocallyOrDie(
+  ScopedShapedBuffer ExecuteLocallyOrDie(
      const Computation& computation,
      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
      const ExecutableBuildOptions& build_options,
diff --git a/tensorflow/compiler/xla/tests/transfer_manager_test.cc b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
index 268ba338f2e..e2067bc1b83 100644
--- a/tensorflow/compiler/xla/tests/transfer_manager_test.cc
+++ b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
@@ -45,7 +45,7 @@ class TransferManagerTest : public LocalClientTestBase {

   ~TransferManagerTest() override = default;

-  std::unique_ptr<ScopedShapedBuffer> AllocateDeviceBuffer(const Shape& shape) {
+  ScopedShapedBuffer AllocateDeviceBuffer(const Shape& shape) {
     return transfer_manager_
         ->AllocateScopedShapedBuffer(
             shape, GetOrCreateAllocator(local_client_->platform()),
@@ -64,10 +64,10 @@ XLA_TEST_F(TransferManagerTest, TransferR0U32) {

   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));

   LiteralTestUtil::ExpectR0Equal<uint32>(42, *result);
 }

@@ -80,10 +80,10 @@ XLA_TEST_F(TransferManagerTest, TransferR1F32) {

   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));

   LiteralTestUtil::ExpectR1Equal<float>({1.25f, 2.5f, -17.0f, -20.125f},
                                         *result);
@@ -98,10 +98,10 @@ XLA_TEST_F(TransferManagerTest, TransferR1LargeF32) {

   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));

   LiteralTestUtil::ExpectR1Equal<float>(test_vector, *result);
 }

@@ -114,10 +114,10 @@ XLA_TEST_F(TransferManagerTest, TransferR1U8) {

   // Round trip literal through device.
   ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
-      stream_executor_, *literal, *device_buffer));
+      stream_executor_, *literal, device_buffer));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
                           transfer_manager_->TransferLiteralFromDevice(
-                              stream_executor_, *device_buffer));
+                              stream_executor_, device_buffer));

   EXPECT_EQ(result->GetR1U8AsString(), test_string);
 }

@@ -130,10 +130,10 @@ XLA_TEST_F(TransferManagerTest, TransferR2F32) {

   // Round trip literal through device.
ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, *device_buffer)); + stream_executor_, *literal, device_buffer)); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, transfer_manager_->TransferLiteralFromDevice( - stream_executor_, *device_buffer)); + stream_executor_, device_buffer)); LiteralTestUtil::ExpectR2Equal( {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, *result); @@ -150,10 +150,10 @@ XLA_TEST_F(TransferManagerTest, // Round trip literal through device. Set the on-device layout to something // different than the literal layout. ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, *device_buffer)); + stream_executor_, *literal, device_buffer)); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, transfer_manager_->TransferLiteralFromDevice( - stream_executor_, *device_buffer)); + stream_executor_, device_buffer)); EXPECT_FALSE( LayoutUtil::Equal(result->shape().layout(), literal->shape().layout())); @@ -170,10 +170,10 @@ XLA_TEST_F(TransferManagerTest, TransferTuple) { // Round trip literal through device. ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, *device_buffer)); + stream_executor_, *literal, device_buffer)); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, transfer_manager_->TransferLiteralFromDevice( - stream_executor_, *device_buffer)); + stream_executor_, device_buffer)); LiteralTestUtil::ExpectEqual(*literal, *result); } @@ -184,10 +184,10 @@ XLA_TEST_F(TransferManagerTest, TransferEmptyTuple) { // Round trip literal through device. ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, *device_buffer)); + stream_executor_, *literal, device_buffer)); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, transfer_manager_->TransferLiteralFromDevice( - stream_executor_, *device_buffer)); + stream_executor_, device_buffer)); LiteralTestUtil::ExpectEqual(*literal, *result); } @@ -204,10 +204,10 @@ XLA_TEST_F(TransferManagerTest, TransferNestedTuple) { // Round trip literal through device. ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, *device_buffer)); + stream_executor_, *literal, device_buffer)); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, transfer_manager_->TransferLiteralFromDevice( - stream_executor_, *device_buffer)); + stream_executor_, device_buffer)); LiteralTestUtil::ExpectEqual(*literal, *result); } @@ -219,10 +219,10 @@ XLA_TEST_F(TransferManagerTest, TransferComplexValue) { // Round trip literal through device. ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, *device_buffer)); + stream_executor_, *literal, device_buffer)); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, transfer_manager_->TransferLiteralFromDevice( - stream_executor_, *device_buffer)); + stream_executor_, device_buffer)); LiteralTestUtil::ExpectEqual(*literal, *result); } @@ -238,10 +238,10 @@ XLA_TEST_F(TransferManagerTest, TransferComplexValueInTuple) { // Round trip literal through device. 
ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, *device_buffer)); + stream_executor_, *literal, device_buffer)); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, transfer_manager_->TransferLiteralFromDevice( - stream_executor_, *device_buffer)); + stream_executor_, device_buffer)); LiteralTestUtil::ExpectEqual(*literal, *result); } diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc index efb00d56c58..837a01e873e 100644 --- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc +++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc @@ -129,18 +129,18 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client, auto* transfer_manager = backend->transfer_manager(); TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr lhs_arg, + ScopedShapedBuffer lhs_arg, transfer_manager->AllocateScopedShapedBuffer( lhs_arg_shape, allocator, backend->default_device_ordinal())); TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice( - executor, *Literal::CreateFromShape(lhs_arg_shape), *lhs_arg)); + executor, *Literal::CreateFromShape(lhs_arg_shape), lhs_arg)); TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr rhs_arg, + ScopedShapedBuffer rhs_arg, transfer_manager->AllocateScopedShapedBuffer( rhs_arg_shape, allocator, backend->default_device_ordinal())); TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice( - executor, *Literal::CreateFromShape(rhs_arg_shape), *rhs_arg)); + executor, *Literal::CreateFromShape(rhs_arg_shape), rhs_arg)); TF_ASSERT_OK_AND_ASSIGN( std::unique_ptr local_executable, @@ -165,7 +165,7 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client, backend->eigen_intra_op_thread_pool()); TF_ASSERT_OK_AND_ASSIGN( auto execution_result, - executable->ExecuteOnStream(&run_options, {lhs_arg.get(), rhs_arg.get()}, + executable->ExecuteOnStream(&run_options, {&lhs_arg, &rhs_arg}, &hlo_execution_profile)); (void)execution_result; From d710d01a015fda65348ac0e5c25be3747624a779 Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Thu, 19 Apr 2018 17:21:50 -0700 Subject: [PATCH 0475/1734] Minor code refactoring. PiperOrigin-RevId: 193600173 --- tensorflow/core/kernels/data/BUILD | 3 ++- tensorflow/core/kernels/data/dataset_utils.cc | 13 +++++++++++++ tensorflow/core/kernels/data/dataset_utils.h | 2 ++ tensorflow/core/kernels/data/iterator_ops.cc | 13 ++----------- tensorflow/core/kernels/data/writer_ops.cc | 15 ++------------- 5 files changed, 21 insertions(+), 25 deletions(-) diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD index 667a6967a85..c78e0aff833 100644 --- a/tensorflow/core/kernels/data/BUILD +++ b/tensorflow/core/kernels/data/BUILD @@ -515,6 +515,7 @@ tf_kernel_library( srcs = ["iterator_ops.cc"], deps = [ ":dataset", + ":dataset_utils", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:dataset_ops_op_lib", "//tensorflow/core:framework", @@ -586,7 +587,7 @@ tf_kernel_library( srcs = ["writer_ops.cc"], deps = [ ":dataset", - "//tensorflow/core:core_cpu_internal", + ":dataset_utils", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", diff --git a/tensorflow/core/kernels/data/dataset_utils.cc b/tensorflow/core/kernels/data/dataset_utils.cc index e3a3601ee84..67ddb52d577 100644 --- a/tensorflow/core/kernels/data/dataset_utils.cc +++ b/tensorflow/core/kernels/data/dataset_utils.cc @@ -14,6 +14,7 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/core/kernels/data/dataset_utils.h" +#include "tensorflow/core/common_runtime/device.h" namespace tensorflow { @@ -45,6 +46,18 @@ Status MakeIteratorFromInputElement( return Status::OK(); } +IteratorContext MakeIteratorContext(OpKernelContext* ctx) { + IteratorContext::Params params; + params.env = ctx->env(); + params.runner = *(ctx->runner()); + params.lib = ctx->function_library(); + DeviceBase* device = ctx->function_library()->device(); + params.allocator_getter = [device](AllocatorAttributes attrs) { + return device->GetAllocator(attrs); + }; + return IteratorContext(params); +} + } // namespace dataset } // namespace tensorflow diff --git a/tensorflow/core/kernels/data/dataset_utils.h b/tensorflow/core/kernels/data/dataset_utils.h index 6c4191c2be6..e5ca71dd99d 100644 --- a/tensorflow/core/kernels/data/dataset_utils.h +++ b/tensorflow/core/kernels/data/dataset_utils.h @@ -28,6 +28,8 @@ Status MakeIteratorFromInputElement( int64 thread_index, CapturedFunction* captured_func, StringPiece prefix, std::unique_ptr* out_iterator); +IteratorContext MakeIteratorContext(OpKernelContext* ctx); + } // namespace dataset } // namespace tensorflow diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc index 4e4997d7b3f..f5db97fd59e 100644 --- a/tensorflow/core/kernels/data/iterator_ops.cc +++ b/tensorflow/core/kernels/data/iterator_ops.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/core/framework/variant_op_registry.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/kernels/data/dataset.h" +#include "tensorflow/core/kernels/data/dataset_utils.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/gtl/cleanup.h" @@ -609,17 +610,7 @@ class ToSingleElementOp : public AsyncOpKernel { ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done); auto iterator = dataset->MakeIterator("SingleElementIterator"); - IteratorContext::Params params; - params.env = ctx->env(); - params.runner = *(ctx->runner()); - params.lib = ctx->function_library(); - DeviceBase* device = ctx->function_library()->device(); - params.allocator_getter = [device](AllocatorAttributes attrs) { - return device->GetAllocator(attrs); - }; - - IteratorContext iter_ctx(std::move(params)); - + IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx); std::vector components; components.reserve(dataset->output_dtypes().size()); bool end_of_sequence; diff --git a/tensorflow/core/kernels/data/writer_ops.cc b/tensorflow/core/kernels/data/writer_ops.cc index 46821fd7b3a..656fee1e856 100644 --- a/tensorflow/core/kernels/data/writer_ops.cc +++ b/tensorflow/core/kernels/data/writer_ops.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/kernels/data/dataset.h" +#include "tensorflow/core/kernels/data/dataset_utils.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/io/record_writer.h" @@ -72,21 +72,10 @@ class ToTFRecordOp : public AsyncOpKernel { ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done); auto iterator = dataset->MakeIterator("ToTFRecordOpIterator"); - IteratorContext::Params params; // TODO(b/78245447) - params.env = ctx->env(); - params.runner = *(ctx->runner()); - params.lib = ctx->function_library(); - DeviceBase* device = ctx->function_library()->device(); - params.allocator_getter = [device](AllocatorAttributes attrs) { - return device->GetAllocator(attrs); - }; - - IteratorContext iter_ctx(std::move(params)); - + IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx); std::vector components; components.reserve(dataset->output_dtypes().size()); bool end_of_sequence; - do { OP_REQUIRES_OK_ASYNC( ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence), From c2905469335715929c630d2bd70068ccbc8eb2d1 Mon Sep 17 00:00:00 2001 From: manhyuk Date: Fri, 20 Apr 2018 09:28:37 +0900 Subject: [PATCH 0476/1734] fix typo --- tensorflow/core/grappler/costs/virtual_scheduler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h index 5116c8183cb..7edd10e3e8a 100644 --- a/tensorflow/core/grappler/costs/virtual_scheduler.h +++ b/tensorflow/core/grappler/costs/virtual_scheduler.h @@ -212,7 +212,7 @@ class FirstReadyManager : public ReadyNodeManager { }; // CompositeNodeManager has a few other NodeManagers: per-device LIFO for normal -// ops (neither _Send nor _Recv) and FirstyReadyManagers for _Send ops and _Recv +// ops (neither _Send nor _Recv) and FirstReadyManagers for _Send ops and _Recv // ops, and then it chooses FirstReady among the ops chosen from each // internal NodeManagers. The objective is to maximize producer-consumer // locality within device, while processing nodes across devices, including From 28a95990bf9ff228abec6a52389a4244a17a9101 Mon Sep 17 00:00:00 2001 From: manhyuk Date: Fri, 20 Apr 2018 09:28:45 +0900 Subject: [PATCH 0477/1734] fix typo --- tensorflow/core/grappler/costs/virtual_scheduler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h index 7edd10e3e8a..67bf1e6980e 100644 --- a/tensorflow/core/grappler/costs/virtual_scheduler.h +++ b/tensorflow/core/grappler/costs/virtual_scheduler.h @@ -199,7 +199,7 @@ class FirstReadyManager : public ReadyNodeManager { // current node. std::vector nodes_; // Newly added nodes are added to waiting_queue_. That way, GetCurrNode(), - // wihch returns the front of the nodes_, always returns the same node, + // which returns the front of the nodes_, always returns the same node, // even if any of new nodes has time_ready smaller than the current node's. std::vector waiting_queue_; // Comparator functor for heap; stl heap is max heap, so we use "greater than" From c18a80967e55350affafbf2ff562056d4bddf234 Mon Sep 17 00:00:00 2001 From: "A. 
Date: Thu, 19 Apr 2018 17:26:41 -0700
Subject: [PATCH 0478/1734] Add support for non-Tensor args in recompute_grad

Previously, the function decorated by recompute_grad had to have a signature
that contained only positional arguments, and all those arguments had to be
Tensors. Most "layers" users define, however, have non-Tensor arguments (for
example, various hyperparameters) and often have keyword arguments as well.
This change allows a user to use whatever function signature they wish while
being explicit about which arguments are Tensors.

PiperOrigin-RevId: 193600682
---
 .../layers/python/layers/rev_block_lib.py     |  77 +++++++++++--
 .../python/layers/rev_block_lib_test.py       | 102 ++++++++++++++++++
 2 files changed, 168 insertions(+), 11 deletions(-)

diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib.py b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
index 02d294c68f1..9f904cc3028 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
@@ -45,6 +45,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_inspect
 
 __all__ = ["rev_block", "RevBlock", "recompute_grad"]
 
@@ -429,12 +430,13 @@ def enable_with_args(dec):
 
 
 @enable_with_args
-def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False):
+def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False,
+                   tensor_arg_names=None):
   """Decorator that recomputes the function on the backwards pass.
 
   Args:
-    fn: a function that takes Tensors (all as positional arguments) and returns
-      a tuple of Tensors.
+    fn: the subgraph-producing function to wrap and recompute when computing
+      gradients. Provide `tensor_arg_names` if not all arguments are `Tensor`s.
     use_data_dep: `bool`, if `True` will use a dummy data dependency to force
       the recompute to happen. If `False` will use a control dependency. By
      default will be `True` if in an XLA context and `False` otherwise. XLA
@@ -443,17 +445,25 @@ def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False):
      that all gradients are produced before any are consumed by downstream
      ops. If `use_data_dep` is also `True`, will use a data dependency instead
      of a control dependency.
+    tensor_arg_names: `list`, names of the `Tensor` arguments to `fn`. If
+      `None`, assumes all arguments are `Tensor`s.
 
   Returns:
    A wrapped fn that is identical to fn when called, but its activations will
    be discarded and recomputed on the backwards pass (i.e. on a call to
    tf.gradients).
""" + if tensor_arg_names: + if not isinstance(tensor_arg_names, (list, tuple)): + raise TypeError("tensor_arg_names must be a list") @functools.wraps(fn) - def wrapped(*args): + def wrapped(*args, **kwargs): + tensor_only_fn, tensor_args = _make_tensor_only(fn, args, kwargs, + tensor_arg_names) return _recompute_grad( - fn, args, use_data_dep=use_data_dep, tupleize_grads=tupleize_grads) + tensor_only_fn, tensor_args, use_data_dep=use_data_dep, + tupleize_grads=tupleize_grads) return wrapped @@ -463,11 +473,59 @@ def _is_on_tpu(): return control_flow_util.GetContainingXLAContext(ctxt) is not None -def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, tupleize_grads=False): +def _make_tensor_only(fn, args, kwargs, tensor_arg_names): + """Return fn such that it only takes Tensor args for tensor_arg_names.""" + argspec = tf_inspect.getargspec(fn) + if argspec.varargs is not None or argspec.keywords is not None: + raise ValueError("Function decorated with recompute_grad must not use " + "*args or **kwargs.") + fn_arg_names = list(argspec.args) + + # name_to_arg is a dict of argument name to argument value, including both + # positional and keyword arguments passed. + name_to_arg = {} + # Populate positional arguments. + for name, arg in zip(fn_arg_names[:len(args)], args): + name_to_arg[name] = arg + # Populate keyword arguments. + name_to_arg.update(kwargs) + + # Separate the Tensor arguments from the non-Tensor arguments. + # The default is that all arguments are Tensor arguments. + tensor_arg_names = tensor_arg_names or fn_arg_names + for name in tensor_arg_names: + if name not in name_to_arg: + raise ValueError("Must provide Tensor argument %s" % name) + tensor_args = [name_to_arg[name] for name in tensor_arg_names] + non_tensor_kwargs = dict([(name, arg) for name, arg in name_to_arg.items() + if name not in tensor_arg_names]) + + # Check that Tensor arguments are in fact Tensors and that non-Tensor + # arguments are not. + for name, arg in zip(tensor_arg_names, tensor_args): + if not isinstance(arg, framework_ops.Tensor): + raise TypeError("Fn argument %s must be a Tensor." % name) + for name, arg in non_tensor_kwargs.items(): + if isinstance(arg, framework_ops.Tensor): + raise TypeError("Fn argument %s must not be a Tensor." % name) + + # Construct a Tensor-only wrapper function that will pass the non-Tensor + # arguments as well when called. 
+ def tensor_only_fn(*tensors): + all_kwargs = dict(zip(tensor_arg_names, tensors)) + all_kwargs.update(non_tensor_kwargs) + return fn(**all_kwargs) + + return tensor_only_fn, tensor_args + + +def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, + tupleize_grads=False): """See recompute_grad.""" for arg in args: if not isinstance(arg, framework_ops.Tensor): raise ValueError("All inputs to function must be Tensors") + use_data_dep_ = use_data_dep if use_data_dep_ == _USE_DEFAULT: use_data_dep_ = _is_on_tpu() @@ -501,14 +559,11 @@ def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, tupleize_grads=False): grad_vars = grads[len(inputs):] return grad_inputs, grad_vars + # TODO(rsepassi): Replace with tf.custom_gradient @_fn_with_custom_grad(grad_fn) def fn_with_recompute(*args): cached_vs.append(variable_scope.get_variable_scope()) - # TODO(rsepassi): Rm conditional in TF 1.4 - if hasattr(contrib_framework_ops, "current_arg_scope"): - cached_arg_scope.append(contrib_framework_ops.current_arg_scope()) - else: - cached_arg_scope.append({}) + cached_arg_scope.append(contrib_framework_ops.current_arg_scope()) return fn(*args) return fn_with_recompute(*args) diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py index 392a490be15..66ccc696f92 100644 --- a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py +++ b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py @@ -318,6 +318,108 @@ class RecomputeTest(test.TestCase): self.assertEqual(1, len(grads)) self.assertTrue(grads[0] is not None) + def testWithNontensorArgs(self): + @rev_block_lib.recompute_grad(tupleize_grads=True, + tensor_arg_names=["inputs"]) + def layer_with_recompute(inputs, plus=None): + var = variable_scope.get_variable("var", ()) + self.assertFalse(plus) # called with False below + if plus: + return var + inputs + else: + return var * inputs + + inputs = array_ops.ones((), dtypes.float32) + outputs = layer_with_recompute(inputs, plus=False) + loss = math_ops.square(outputs) + grads = gradients_impl.gradients(loss, variables.trainable_variables()) + self.assertEqual(1, len(grads)) + self.assertTrue(grads[0] is not None) + + +class MakeTensorOnlyTest(test.TestCase): + + def testMakeTensorOnly(self): + def fn(a, b, c, d=1, e=None, f=7): + return (a, b, c, d, e, f) + + t1 = array_ops.ones(()) + t2 = array_ops.ones(()) + t3 = array_ops.ones(()) + args = [1, t1, 3, t2] + kwargs = {"e": t3} + tensor_only_fn, tensor_args = rev_block_lib._make_tensor_only( + fn, args, kwargs, ["b", "d", "e"]) + self.assertAllEqual(tensor_args, [t1, t2, t3]) + out = tensor_only_fn(*tensor_args) + self.assertAllEqual(out, (1, t1, 3, t2, t3, 7)) + + def testMakeTensorOnlyPositionalArgsOnly(self): + def fn(a, b, c): + return (a, b, c) + + t1 = array_ops.ones(()) + t2 = array_ops.ones(()) + args = [t1, 3, t2] + tensor_only_fn, tensor_args = rev_block_lib._make_tensor_only( + fn, args, {}, ["a", "c"]) + self.assertAllEqual(tensor_args, [t1, t2]) + out = tensor_only_fn(*tensor_args) + self.assertAllEqual(out, (t1, 3, t2)) + + def testMakeTensorOnlyKwargsArgsOnly(self): + def fn(a=1, b=2, c=3): + return (a, b, c) + + t1 = array_ops.ones(()) + t2 = array_ops.ones(()) + args = [t1] + kwargs = {"c": t2} + tensor_only_fn, tensor_args = rev_block_lib._make_tensor_only( + fn, args, kwargs, ["a", "c"]) + self.assertAllEqual(tensor_args, [t1, t2]) + out = tensor_only_fn(*tensor_args) + self.assertAllEqual(out, (t1, 2, t2)) + + def 
testErrorOnMissingTensorArg(self): + def fn(a, b): + return (a, b) + + with self.assertRaisesWithPredicateMatch( + ValueError, "provide Tensor argument"): + rev_block_lib._make_tensor_only(fn, [], {"b": 2}, ["a"]) + + def testErrorOnSignatureSplats(self): + def fn1(a, *args): + return (a, args) + + err_msg = r"must not use \*args or \*\*kwargs" + with self.assertRaisesWithPredicateMatch(ValueError, err_msg): + rev_block_lib._make_tensor_only(fn1, [1, 2], {}, ["a"]) + + def fn2(a, **kwargs): + return (a, kwargs) + + with self.assertRaisesWithPredicateMatch(ValueError, err_msg): + rev_block_lib._make_tensor_only(fn2, [], {"a": 1, "b": 2}, ["a"]) + + def testErrorOnNonTensorForTensor(self): + def fn(a, b): + return (a, b) + + with self.assertRaisesWithPredicateMatch(TypeError, "must be a Tensor"): + rev_block_lib._make_tensor_only(fn, [2, 3], {}, ["a"]) + + def testErrorOnTensorForNonTensor(self): + def fn(a, b): + return (a, b) + + with self.assertRaisesWithPredicateMatch( + TypeError, "must not be a Tensor"): + t1 = array_ops.ones(()) + t2 = array_ops.ones(()) + rev_block_lib._make_tensor_only(fn, [t1, t2], {}, ["a"]) + class FnWithCustomGradTest(test.TestCase): From 13a7e9820a800cf3877e5a44b9f654f79808a2d4 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 19 Apr 2018 17:27:04 -0700 Subject: [PATCH 0479/1734] Update DecodeProtoOp so that it returns explicitly specified default values for missing fields. PiperOrigin-RevId: 193600735 --- .../kernel_tests/defaut_values.TestCase.pbtxt | 94 +++++++++ .../promote_unsigned.TestCase.pbtxt | 10 +- .../python/kernel_tests/test_example.proto | 33 +++ tensorflow/core/kernels/decode_proto_op.cc | 188 +++++++++++++++--- 4 files changed, 300 insertions(+), 25 deletions(-) create mode 100644 tensorflow/contrib/proto/python/kernel_tests/defaut_values.TestCase.pbtxt diff --git a/tensorflow/contrib/proto/python/kernel_tests/defaut_values.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/defaut_values.TestCase.pbtxt new file mode 100644 index 00000000000..4e316819077 --- /dev/null +++ b/tensorflow/contrib/proto/python/kernel_tests/defaut_values.TestCase.pbtxt @@ -0,0 +1,94 @@ +primitive { + # No fields specified, so we get all defaults +} +shape: 1 +sizes: 0 +field { + name: "double_default" + dtype: DT_DOUBLE + expected { double_value: 1.0 } +} +sizes: 0 +field { + name: "float_default" + dtype: DT_DOUBLE # Try casting the float field to double. 
+ expected { double_value: 2.0 } +} +sizes: 0 +field { + name: "int64_default" + dtype: DT_INT64 + expected { int64_value: 3 } +} +sizes: 0 +field { + name: "uint64_default" + dtype: DT_INT64 + expected { int64_value: 4 } +} +sizes: 0 +field { + name: "int32_default" + dtype: DT_INT32 + expected { int32_value: 5 } +} +sizes: 0 +field { + name: "fixed64_default" + dtype: DT_INT64 + expected { int64_value: 6 } +} +sizes: 0 +field { + name: "fixed32_default" + dtype: DT_INT32 + expected { int32_value: 7 } +} +sizes: 0 +field { + name: "bool_default" + dtype: DT_BOOL + expected { bool_value: true } +} +sizes: 0 +field { + name: "string_default" + dtype: DT_STRING + expected { string_value: "a" } +} +sizes: 0 +field { + name: "bytes_default" + dtype: DT_STRING + expected { string_value: "a longer default string" } +} +sizes: 0 +field { + name: "uint32_default" + dtype: DT_INT32 + expected { int32_value: -1 } +} +sizes: 0 +field { + name: "sfixed32_default" + dtype: DT_INT32 + expected { int32_value: 10 } +} +sizes: 0 +field { + name: "sfixed64_default" + dtype: DT_INT64 + expected { int64_value: 11 } +} +sizes: 0 +field { + name: "sint32_default" + dtype: DT_INT32 + expected { int32_value: 12 } +} +sizes: 0 +field { + name: "sint64_default" + dtype: DT_INT64 + expected { int64_value: 13 } +} diff --git a/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt index db7555bf2df..bc07efc8f30 100644 --- a/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt +++ b/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt @@ -4,7 +4,6 @@ primitive { } shape: 1 sizes: 1 -sizes: 1 field { name: "fixed32_value" dtype: DT_INT64 @@ -12,6 +11,7 @@ field { int64_value: 4294967295 } } +sizes: 1 field { name: "uint32_value" dtype: DT_INT64 @@ -19,3 +19,11 @@ field { int64_value: 4294967295 } } +sizes: 0 +field { + name: "uint32_default" + dtype: DT_INT64 + expected { + int64_value: 4294967295 # Comes from an explicitly-specified default + } +} diff --git a/tensorflow/contrib/proto/python/kernel_tests/test_example.proto b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto index dc495034ffa..a2c88e372bf 100644 --- a/tensorflow/contrib/proto/python/kernel_tests/test_example.proto +++ b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto @@ -72,6 +72,23 @@ message RepeatedPrimitiveValue { repeated sint32 sint32_value = 17; repeated sint64 sint64_value = 18; repeated PrimitiveValue message_value = 19; + + // Optional fields with explicitly-specified defaults. 
+ optional double double_default = 20 [default = 1.0]; + optional float float_default = 21 [default = 2.0]; + optional int64 int64_default = 22 [default = 3]; + optional uint64 uint64_default = 23 [default = 4]; + optional int32 int32_default = 24 [default = 5]; + optional fixed64 fixed64_default = 25 [default = 6]; + optional fixed32 fixed32_default = 26 [default = 7]; + optional bool bool_default = 27 [default = true]; + optional string string_default = 28 [default = "a"]; + optional bytes bytes_default = 29 [default = "a longer default string"]; + optional uint32 uint32_default = 30 [default = 4294967295]; + optional sfixed32 sfixed32_default = 31 [default = 10]; + optional sfixed64 sfixed64_default = 32 [default = 11]; + optional sint32 sint32_default = 33 [default = 12]; + optional sint64 sint64_default = 34 [default = 13]; } // A PackedPrimitiveValue looks exactly the same as a RepeatedPrimitiveValue @@ -97,6 +114,22 @@ message PackedPrimitiveValue { repeated sint32 sint32_value = 17 [packed = true]; repeated sint64 sint64_value = 18 [packed = true]; repeated PrimitiveValue message_value = 19; + + optional double double_default = 20 [default = 1.0]; + optional float float_default = 21 [default = 2.0]; + optional int64 int64_default = 22 [default = 3]; + optional uint64 uint64_default = 23 [default = 4]; + optional int32 int32_default = 24 [default = 5]; + optional fixed64 fixed64_default = 25 [default = 6]; + optional fixed32 fixed32_default = 26 [default = 7]; + optional bool bool_default = 27 [default = true]; + optional string string_default = 28 [default = "a"]; + optional bytes bytes_default = 29 [default = "a longer default string"]; + optional uint32 uint32_default = 30 [default = 4294967295]; + optional sfixed32 sfixed32_default = 31 [default = 10]; + optional sfixed64 sfixed64_default = 32 [default = 11]; + optional sint32 sint32_default = 33 [default = 12]; + optional sint64 sint64_default = 34 [default = 13]; } message EnumValue { diff --git a/tensorflow/core/kernels/decode_proto_op.cc b/tensorflow/core/kernels/decode_proto_op.cc index b4e5b776ed6..24f8a4f72fd 100644 --- a/tensorflow/core/kernels/decode_proto_op.cc +++ b/tensorflow/core/kernels/decode_proto_op.cc @@ -105,11 +105,137 @@ bool CheckOutputType(FieldDescriptor::Type field_type, DataType output_type) { } } +// Used to store the default value of a protocol message field, casted to the +// type of the output tensor. +// +// TODO(paskin): Use absl::variant once TensorFlow gets absl dependencies. +struct DefaultValue { + DataType dtype = DataType::DT_INVALID; + union Value { + bool v_bool; // DT_BOOL + uint8 v_uint8; // DT_UINT8 + int8 v_int8; // DT_INT8 + int32 v_int32; // DT_INT32 + int64 v_int64; // DT_INT64 + float v_float; // DT_FLOAT + double v_double; // DT_DOUBLE + const char* v_string; // DT_STRING + }; + Value value; +}; + +// Initializes a DefaultValue object. This generic template handles numeric +// types and strings are handled by a template specialization below. 
+//
+// Args:
+//   dtype: the type of the output tensor
+//   value: the default value as obtained from the FieldDescriptor
+//   result: the object to initialize
+template <typename T>
+Status InitDefaultValue(DataType dtype, const T value, DefaultValue* result) {
+  result->dtype = dtype;
+  switch (dtype) {
+    case DT_BOOL:
+      result->value.v_bool = static_cast<bool>(value);
+      break;
+    case DT_INT32:
+      result->value.v_int32 = static_cast<int32>(value);
+      break;
+    case DT_INT8:
+      result->value.v_int8 = static_cast<int8>(value);
+      break;
+    case DT_UINT8:
+      result->value.v_uint8 = static_cast<uint8>(value);
+      break;
+    case DT_INT64:
+      result->value.v_int64 = static_cast<int64>(value);
+      break;
+    case DT_FLOAT:
+      result->value.v_float = static_cast<float>(value);
+      break;
+    case DT_DOUBLE:
+      result->value.v_double = static_cast<double>(value);
+      break;
+    default:
+      // We should never get here, given the type checking that occurs earlier.
+      return errors::Internal(
+          "Cannot initialize default value for unsupported type: ",
+          DataTypeString(dtype));
+  }
+  return Status::OK();
+}
+
+template <>
+Status InitDefaultValue(DataType dtype, const char* value,
+                        DefaultValue* result) {
+  // These are sanity checks that should never trigger given the code that
+  // leads here.
+  if (TF_PREDICT_FALSE(dtype != DT_STRING)) {
+    return errors::InvalidArgument(
+        "Cannot cast field to anything but DT_STRING");
+  }
+  if (TF_PREDICT_FALSE(value == nullptr)) {
+    return errors::InvalidArgument("Null default string value.");
+  }
+  result->dtype = DT_STRING;
+  result->value.v_string = value;
+  return Status::OK();
+}
+
+// Initializes a default value from the output data type and the field
+// descriptor.
+Status InitDefaultValueFromFieldDescriptor(DataType dtype,
+                                           const FieldDescriptor* field_desc,
+                                           DefaultValue* result) {
+  switch (field_desc->type()) {
+    case WireFormatLite::TYPE_DOUBLE:
+      return InitDefaultValue(dtype, field_desc->default_value_double(),
+                              result);
+    case WireFormatLite::TYPE_FLOAT:
+      return InitDefaultValue(dtype, field_desc->default_value_float(), result);
+    case WireFormatLite::TYPE_INT64:
+    case WireFormatLite::TYPE_SINT64:
+    case WireFormatLite::TYPE_SFIXED64:
+      return InitDefaultValue(dtype, field_desc->default_value_int64(), result);
+    case WireFormatLite::TYPE_FIXED64:
+    case WireFormatLite::TYPE_UINT64:
+      return InitDefaultValue(dtype, field_desc->default_value_uint64(),
+                              result);
+    case WireFormatLite::TYPE_ENUM:
+    case WireFormatLite::TYPE_INT32:
+    case WireFormatLite::TYPE_SINT32:
+    case WireFormatLite::TYPE_SFIXED32:
+      return InitDefaultValue(dtype, field_desc->default_value_int32(), result);
+    case WireFormatLite::TYPE_FIXED32:
+    case WireFormatLite::TYPE_UINT32:
+      return InitDefaultValue(dtype, field_desc->default_value_uint32(),
+                              result);
+    case WireFormatLite::TYPE_BOOL:
+      return InitDefaultValue(dtype, field_desc->default_value_bool(), result);
+    case WireFormatLite::TYPE_BYTES:
+    case WireFormatLite::TYPE_STRING:
+      // Manipulating default string values as C-style pointers should be OK
+      // for typical code-generated protocol messages. It is possible in
+      // principle to register a message descriptor on the fly, and these
+      // pointers may not be stable if that descriptor has a weird
+      // implementation. (But the return type of default_value_string() is
+      // const string&, so it'd have to be very weird.)
+      return InitDefaultValue(dtype, field_desc->default_value_string().c_str(),
+                              result);
+    case WireFormatLite::TYPE_GROUP:
+    case WireFormatLite::TYPE_MESSAGE:
+      return InitDefaultValue(dtype, "", result);
+      // default: intentionally omitted in order to enable static checking.
+  }
+  return Status::OK();
+}
+
 // A FieldInfo holds a handful of information from the FieldDescriptor
 // and user attributes.
 struct FieldInfo {
-  FieldInfo(const FieldDescriptor* field_desc, int user_index)
-      : output_index(user_index) {
+  FieldInfo(const FieldDescriptor* field_desc, int user_index,
+            DefaultValue def_value)
+      : output_index(user_index), default_value(def_value) {
     // Without this intermediate data structure, the profile had hotspots
     // calling methods of FieldDescriptor.
     number = field_desc->number();
@@ -144,6 +270,7 @@ struct FieldInfo {
   WireFormatLite::FieldType type;
   int number;
   bool is_repeated;
+  DefaultValue default_value;
 };
 
 // A CountCollector counts sizes of repeated and optional fields in a proto.
@@ -394,8 +521,11 @@ class DenseCollector {
   DenseCollector() = default;
 
   // A DenseCollector applies to one field of a serialized message.
-  DenseCollector(uint8* datap, DataType dtype, int max_repeat_count)
-      : datap_(datap), dtype_(dtype), max_repeat_count_(max_repeat_count) {}
+  // Note that default_value.dtype is the type of the output tensor.
+  DenseCollector(uint8* datap, DefaultValue default_value, int max_repeat_count)
+      : datap_(datap),
+        default_value_(default_value),
+        max_repeat_count_(max_repeat_count) {}
 
   // Reads a value from the input stream and stores it.
   //
@@ -415,8 +545,8 @@
     }
     next_repeat_index_ = index + 1;
 
-    return internal::ReadValue(input, field.type, field.number, dtype_, index,
-                               datap_);
+    return internal::ReadValue(input, field.type, field.number,
+                               default_value_.dtype, index, datap_);
   }
 
   // Reads and stores a length-delimited list of values.
@@ -445,8 +575,8 @@
          field.number, ", Max entries allowed: ", max_repeat_count_);
    } else {
      return internal::ReadPackedFromArray(buf, buf_size, field.type,
-                                           field.number, dtype_, stride,
-                                           &next_repeat_index_, datap_);
+                                           field.number, default_value_.dtype,
+                                           stride, &next_repeat_index_, datap_);
    }
  }
 
@@ -454,23 +584,23 @@
  // Dispatches to the appropriately typed field default based on the
  // runtime type tag.
  Status FillWithDefaults() {
-    switch (dtype_) {
+    switch (default_value_.dtype) {
      case DataType::DT_FLOAT:
-        return FillDefault<float>();
+        return FillDefault<float>(default_value_.value.v_float);
      case DataType::DT_DOUBLE:
-        return FillDefault<double>();
+        return FillDefault<double>(default_value_.value.v_double);
      case DataType::DT_INT32:
-        return FillDefault<int32>();
+        return FillDefault<int32>(default_value_.value.v_int32);
      case DataType::DT_UINT8:
-        return FillDefault<uint8>();
+        return FillDefault<uint8>(default_value_.value.v_uint8);
      case DataType::DT_INT8:
-        return FillDefault<int8>();
+        return FillDefault<int8>(default_value_.value.v_int8);
      case DataType::DT_STRING:
-        return FillDefault<string>();
+        return FillDefault<string>(default_value_.value.v_string);
      case DataType::DT_INT64:
-        return FillDefault<int64>();
+        return FillDefault<int64>(default_value_.value.v_int64);
      case DataType::DT_BOOL:
-        return FillDefault<bool>();
+        return FillDefault<bool>(default_value_.value.v_bool);
      default:
        // There are many tensorflow dtypes not handled here, but they
        // should not come up unless type casting is added to the Op.
@@ -485,9 +615,9 @@
  // default value. This uses next_repeat_index_ which counts the number
  // of parsed values for the field.
  template <typename T>
-  Status FillDefault() {
+  Status FillDefault(const T& default_value) {
    for (int i = next_repeat_index_; i < max_repeat_count_; i++) {
-      reinterpret_cast<T*>(datap_)[i] = T();
+      reinterpret_cast<T*>(datap_)[i] = default_value;
    }
    return Status::OK();
  }
@@ -501,7 +631,7 @@
  // for more items than we have allocated space.
  void* const datap_ = nullptr;
-  const DataType dtype_ = DataType::DT_INVALID;
+  const DefaultValue default_value_;
  const int max_repeat_count_ = 0;
};
@@ -577,8 +707,14 @@ class DecodeProtoOp : public OpKernel {
    // Now store the fields in sorted order.
    for (int i = 0; i < field_names.size(); i++) {
-      fields_.push_back(MakeUnique<FieldInfo>(field_descs[output_indices[i]],
-                                              output_indices[i]));
+      const int output_index = output_indices[i];
+      const DataType dtype = output_types[output_index];
+      const FieldDescriptor* field_descriptor = field_descs[output_index];
+      DefaultValue default_value;
+      OP_REQUIRES_OK(context, InitDefaultValueFromFieldDescriptor(
+                                  dtype, field_descriptor, &default_value));
+      fields_.push_back(
+          MakeUnique<FieldInfo>(field_descriptor, output_index, default_value));
    }
 
    message_prototype_ = message_factory_.GetPrototype(message_desc);
@@ -805,9 +941,13 @@ class DecodeProtoOp : public OpKernel {
    std::vector<DenseCollector> collectors;
    collectors.reserve(field_count);
-    for (const TensorInfo& info : tensors) {
+    for (int output_index = 0; output_index < field_count; ++output_index) {
+      const TensorInfo& info = tensors[output_index];
+      const FieldInfo* field_info = fields_[output_index].get();
+      DCHECK(field_info != nullptr);
+      const DefaultValue default_value = field_info->default_value;
      collectors.emplace_back(info.data + message_index * info.stride,
-                              info.dtype, info.last_dim_size);
+                              default_value, info.last_dim_size);
    }
 
    // Fill in output tensors from the wire.
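In short, PATCH 0479 means that a missing optional field with an
explicitly-specified proto default (e.g. [default = "a"]) now decodes to that
default rather than to a zeroed value. A minimal Python sketch of the intended
behavior through the experimental decode_proto wrapper follows; the
message_type string and import path are illustrative assumptions and depend on
how test_example.proto is compiled and registered in the binary:

    import tensorflow as tf
    from tensorflow.contrib.proto.python.ops import decode_proto_op

    # An empty serialized message: both fields below are missing, so their
    # output slots should be filled from the declared field defaults.
    serialized = tf.constant([""])
    sizes, values = decode_proto_op.decode_proto(
        serialized,
        message_type="tensorflow.contrib.proto.RepeatedPrimitiveValue",
        field_names=["string_default", "int64_default"],
        output_types=[tf.string, tf.int64])

    with tf.Session() as sess:
        string_vals, int_vals = sess.run([values[0], values[1]])
        # Before this patch the padding for missing fields was "" and 0;
        # with it, the padding comes from the [default = "a"] and
        # [default = 3] annotations, matching the expectations spelled out
        # in defaut_values.TestCase.pbtxt above.

The sizes output still reports 0 for the missing fields; only the values
written into the dense output tensors change.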
From 976229dcbfde389864069433ebfc4085015df9c1 Mon Sep 17 00:00:00 2001 From: Austin Anderson Date: Thu, 19 Apr 2018 17:30:49 -0700 Subject: [PATCH 0480/1734] Internal testing changes PiperOrigin-RevId: 193601134 --- tensorflow/contrib/lite/kernels/BUILD | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD index 8cfa7e53d1d..80cefe83b29 100644 --- a/tensorflow/contrib/lite/kernels/BUILD +++ b/tensorflow/contrib/lite/kernels/BUILD @@ -212,6 +212,7 @@ tf_cc_test( name = "audio_spectrogram_test", size = "small", srcs = ["audio_spectrogram_test.cc"], + tags = ["tflite_not_portable_ios"], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -225,6 +226,7 @@ tf_cc_test( name = "mfcc_test", size = "small", srcs = ["mfcc_test.cc"], + tags = ["tflite_not_portable_ios"], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -346,6 +348,7 @@ tf_cc_test( name = "cast_test", size = "small", srcs = ["cast_test.cc"], + tags = ["tflite_not_portable_ios"], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -398,6 +401,7 @@ tf_cc_test( name = "dequantize_test", size = "small", srcs = ["dequantize_test.cc"], + tags = ["tflite_not_portable_ios"], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -504,6 +508,7 @@ tf_cc_test( name = "maximum_minimum_test", size = "small", srcs = ["maximum_minimum_test.cc"], + tags = ["tflite_not_portable_ios"], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", From 7f87125dceb3c69c5fd1d0712c6c93cc4ceaa854 Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Thu, 19 Apr 2018 17:39:09 -0700 Subject: [PATCH 0481/1734] internal END_PUBLIC BEGIN_PUBLIC Automated g4 rollback of changelist 193571934 PiperOrigin-RevId: 193602050 --- tensorflow/core/lib/io/record_reader.cc | 149 ++++++++++---- tensorflow/core/lib/io/record_reader.h | 18 +- tensorflow/core/lib/io/recordio_test.cc | 216 +++++++-------------- tensorflow/core/lib/io/zlib_inputstream.cc | 9 +- tensorflow/core/lib/io/zlib_inputstream.h | 10 +- 5 files changed, 192 insertions(+), 210 deletions(-) diff --git a/tensorflow/core/lib/io/record_reader.cc b/tensorflow/core/lib/io/record_reader.cc index c24628be570..6de850bb207 100644 --- a/tensorflow/core/lib/io/record_reader.cc +++ b/tensorflow/core/lib/io/record_reader.cc @@ -56,55 +56,110 @@ RecordReaderOptions RecordReaderOptions::CreateRecordReaderOptions( RecordReader::RecordReader(RandomAccessFile* file, const RecordReaderOptions& options) - : options_(options), - input_stream_(new RandomAccessInputStream(file)), - last_read_failed_(false) { + : src_(file), options_(options) { if (options.buffer_size > 0) { - input_stream_.reset(new BufferedInputStream(input_stream_.release(), - options.buffer_size, true)); + input_stream_.reset(new BufferedInputStream(file, options.buffer_size)); + } else { + input_stream_.reset(new RandomAccessInputStream(file)); } if (options.compression_type == RecordReaderOptions::ZLIB_COMPRESSION) { // We don't have zlib available on all embedded platforms, so fail. 
#if defined(IS_SLIM_BUILD) LOG(FATAL) << "Zlib compression is unsupported on mobile platforms."; #else // IS_SLIM_BUILD - input_stream_.reset(new ZlibInputStream( - input_stream_.release(), options.zlib_options.input_buffer_size, - options.zlib_options.output_buffer_size, options.zlib_options, true)); + zlib_input_stream_.reset(new ZlibInputStream( + input_stream_.get(), options.zlib_options.input_buffer_size, + options.zlib_options.output_buffer_size, options.zlib_options)); #endif // IS_SLIM_BUILD } else if (options.compression_type == RecordReaderOptions::NONE) { // Nothing to do. } else { - LOG(FATAL) << "Unrecognized compression type :" << options.compression_type; + LOG(FATAL) << "Unspecified compression type :" << options.compression_type; } } // Read n+4 bytes from file, verify that checksum of first n bytes is // stored in the last 4 bytes and store the first n bytes in *result. -// -// offset corresponds to the user-provided value to ReadRecord() -// and is used only in error messages. -Status RecordReader::ReadChecksummed(uint64 offset, size_t n, string* result) { +// May use *storage as backing store. +Status RecordReader::ReadChecksummed(uint64 offset, size_t n, + StringPiece* result, string* storage) { if (n >= SIZE_MAX - sizeof(uint32)) { return errors::DataLoss("record size too large"); } const size_t expected = n + sizeof(uint32); - TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(expected, result)); + storage->resize(expected); - if (result->size() != expected) { - if (result->empty()) { - return errors::OutOfRange("eof"); - } else { - return errors::DataLoss("truncated record at ", offset); +#if !defined(IS_SLIM_BUILD) + if (zlib_input_stream_) { + // If we have a zlib compressed buffer, we assume that the + // file is being read sequentially, and we use the underlying + // implementation to read the data. + // + // No checks are done to validate that the file is being read + // sequentially. At some point the zlib input buffer may support + // seeking, possibly inefficiently. + TF_RETURN_IF_ERROR(zlib_input_stream_->ReadNBytes(expected, storage)); + + if (storage->size() != expected) { + if (storage->empty()) { + return errors::OutOfRange("eof"); + } else { + return errors::DataLoss("truncated record at ", offset); + } } - } - const uint32 masked_crc = core::DecodeFixed32(result->data() + n); - if (crc32c::Unmask(masked_crc) != crc32c::Value(result->data(), n)) { - return errors::DataLoss("corrupted record at ", offset); + uint32 masked_crc = core::DecodeFixed32(storage->data() + n); + if (crc32c::Unmask(masked_crc) != crc32c::Value(storage->data(), n)) { + return errors::DataLoss("corrupted record at ", offset); + } + *result = StringPiece(storage->data(), n); + } else { +#endif // IS_SLIM_BUILD + if (options_.buffer_size > 0) { + // If we have a buffer, we assume that the file is being read + // sequentially, and we use the underlying implementation to read the + // data. + // + // No checks are done to validate that the file is being read + // sequentially. 
+      TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(expected, storage));
+
+      if (storage->size() != expected) {
+        if (storage->empty()) {
+          return errors::OutOfRange("eof");
+        } else {
+          return errors::DataLoss("truncated record at ", offset);
+        }
+      }
+
+      const uint32 masked_crc = core::DecodeFixed32(storage->data() + n);
+      if (crc32c::Unmask(masked_crc) != crc32c::Value(storage->data(), n)) {
+        return errors::DataLoss("corrupted record at ", offset);
+      }
+      *result = StringPiece(storage->data(), n);
+    } else {
+      // This version supports reading from arbitrary offsets
+      // since we are accessing the random access file directly.
+      StringPiece data;
+      TF_RETURN_IF_ERROR(src_->Read(offset, expected, &data, &(*storage)[0]));
+      if (data.size() != expected) {
+        if (data.empty()) {
+          return errors::OutOfRange("eof");
+        } else {
+          return errors::DataLoss("truncated record at ", offset);
+        }
+      }
+      const uint32 masked_crc = core::DecodeFixed32(data.data() + n);
+      if (crc32c::Unmask(masked_crc) != crc32c::Value(data.data(), n)) {
+        return errors::DataLoss("corrupted record at ", offset);
+      }
+      *result = StringPiece(data.data(), n);
+    }
+#if !defined(IS_SLIM_BUILD)
  }
+#endif  // IS_SLIM_BUILD
+
  return Status::OK();
}
@@ -112,42 +167,50 @@ Status RecordReader::ReadRecord(uint64* offset, string* record) {
  static const size_t kHeaderSize = sizeof(uint64) + sizeof(uint32);
  static const size_t kFooterSize = sizeof(uint32);
 
-  // Position the input stream.
-  int64 curr_pos = input_stream_->Tell();
-  int64 desired_pos = static_cast<int64>(*offset);
-  if (curr_pos > desired_pos || curr_pos < 0 /* EOF */ ||
-      (curr_pos == desired_pos && last_read_failed_)) {
-    last_read_failed_ = false;
-    TF_RETURN_IF_ERROR(input_stream_->Reset());
-    TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(desired_pos));
-  } else if (curr_pos < desired_pos) {
-    TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(desired_pos - curr_pos));
-  }
-  DCHECK_EQ(desired_pos, input_stream_->Tell());
-
  // Read header data.
-  Status s = ReadChecksummed(*offset, sizeof(uint64), record);
+  StringPiece lbuf;
+  Status s = ReadChecksummed(*offset, sizeof(uint64), &lbuf, record);
  if (!s.ok()) {
-    last_read_failed_ = true;
    return s;
  }
-  const uint64 length = core::DecodeFixed64(record->data());
+  const uint64 length = core::DecodeFixed64(lbuf.data());
 
  // Read data
-  s = ReadChecksummed(*offset + kHeaderSize, length, record);
+  StringPiece data;
+  s = ReadChecksummed(*offset + kHeaderSize, length, &data, record);
  if (!s.ok()) {
-    last_read_failed_ = true;
    if (errors::IsOutOfRange(s)) {
      s = errors::DataLoss("truncated record at ", *offset);
    }
    return s;
  }
 
+  if (record->data() != data.data()) {
+    // RandomAccessFile placed the data in some other location.
+    memmove(&(*record)[0], data.data(), data.size());
+  }
+
+  record->resize(data.size());
+
  *offset += kHeaderSize + length + kFooterSize;
-  DCHECK_EQ(*offset, input_stream_->Tell());
  return Status::OK();
}
 
+Status RecordReader::SkipNBytes(uint64 offset) {
+#if !defined(IS_SLIM_BUILD)
+  if (zlib_input_stream_) {
+    TF_RETURN_IF_ERROR(zlib_input_stream_->SkipNBytes(offset));
+  } else {
+#endif
+    if (options_.buffer_size > 0) {
+      TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(offset));
+    }
+#if !defined(IS_SLIM_BUILD)
+  }
+#endif
+  return Status::OK();
+}  // namespace io
+
 SequentialRecordReader::SequentialRecordReader(
    RandomAccessFile* file, const RecordReaderOptions& options)
    : underlying_(file, options), offset_(0) {}
diff --git a/tensorflow/core/lib/io/record_reader.h b/tensorflow/core/lib/io/record_reader.h
index f6d587dfa0e..26278e03284 100644
--- a/tensorflow/core/lib/io/record_reader.h
+++ b/tensorflow/core/lib/io/record_reader.h
@@ -69,14 +69,25 @@ class RecordReader {
  // Read the record at "*offset" into *record and update *offset to
  // point to the offset of the next record. Returns OK on success,
  // OUT_OF_RANGE for end of file, or something else for an error.
+  //
+  // Note: if buffering is used (with or without compression), access must be
+  // sequential.
  Status ReadRecord(uint64* offset, string* record);
 
- private:
-  Status ReadChecksummed(uint64 offset, size_t n, string* result);
+  // Skip the records till "offset". Returns OK on success,
+  // OUT_OF_RANGE for end of file, or something else for an error.
+  Status SkipNBytes(uint64 offset);
 
+ private:
+  Status ReadChecksummed(uint64 offset, size_t n, StringPiece* result,
+                         string* storage);
+
+  RandomAccessFile* src_;
  RecordReaderOptions options_;
  std::unique_ptr<InputStreamInterface> input_stream_;
-  bool last_read_failed_;
+#if !defined(IS_SLIM_BUILD)
+  std::unique_ptr<ZlibInputStream> zlib_input_stream_;
+#endif  // IS_SLIM_BUILD
 
  TF_DISALLOW_COPY_AND_ASSIGN(RecordReader);
};
@@ -110,6 +121,7 @@ class SequentialRecordReader {
      return errors::InvalidArgument(
          "Trying to seek offset: ", offset,
          " which is less than the current offset: ", offset_);
+    TF_RETURN_IF_ERROR(underlying_.SkipNBytes(offset - offset_));
    offset_ = offset;
    return Status::OK();
  }
diff --git a/tensorflow/core/lib/io/recordio_test.cc b/tensorflow/core/lib/io/recordio_test.cc
index da514bd21c7..63235761d92 100644
--- a/tensorflow/core/lib/io/recordio_test.cc
+++ b/tensorflow/core/lib/io/recordio_test.cc
@@ -26,11 +26,10 @@ limitations under the License.
 
 namespace tensorflow {
 namespace io {
-namespace {
 
 // Construct a string of the specified length made out of the supplied
 // partial string.
-string BigString(const string& partial_string, size_t n) { +static string BigString(const string& partial_string, size_t n) { string result; while (result.size() < n) { result.append(partial_string); @@ -40,66 +39,62 @@ string BigString(const string& partial_string, size_t n) { } // Construct a string from a number -string NumberString(int n) { +static string NumberString(int n) { char buf[50]; snprintf(buf, sizeof(buf), "%d.", n); return string(buf); } // Return a skewed potentially long string -string RandomSkewedString(int i, random::SimplePhilox* rnd) { +static string RandomSkewedString(int i, random::SimplePhilox* rnd) { return BigString(NumberString(i), rnd->Skewed(17)); } -class StringDest : public WritableFile { - public: - explicit StringDest(string* contents) : contents_(contents) {} - - Status Close() override { return Status::OK(); } - Status Flush() override { return Status::OK(); } - Status Sync() override { return Status::OK(); } - Status Append(const StringPiece& slice) override { - contents_->append(slice.data(), slice.size()); - return Status::OK(); - } - - private: - string* contents_; -}; - -class StringSource : public RandomAccessFile { - public: - explicit StringSource(string* contents) - : contents_(contents), force_error_(false) {} - - Status Read(uint64 offset, size_t n, StringPiece* result, - char* scratch) const override { - if (force_error_) { - force_error_ = false; - return errors::DataLoss("read error"); - } - - if (offset >= contents_->size()) { - return errors::OutOfRange("end of file"); - } - - if (contents_->size() < offset + n) { - n = contents_->size() - offset; - } - *result = StringPiece(contents_->data() + offset, n); - return Status::OK(); - } - - void force_error() { force_error_ = true; } - - private: - string* contents_; - mutable bool force_error_; -}; - class RecordioTest : public ::testing::Test { private: - string contents_; + class StringDest : public WritableFile { + public: + string contents_; + + Status Close() override { return Status::OK(); } + Status Flush() override { return Status::OK(); } + Status Sync() override { return Status::OK(); } + Status Append(const StringPiece& slice) override { + contents_.append(slice.data(), slice.size()); + return Status::OK(); + } + }; + + class StringSource : public RandomAccessFile { + public: + StringPiece contents_; + mutable bool force_error_; + mutable bool returned_partial_; + StringSource() : force_error_(false), returned_partial_(false) {} + + Status Read(uint64 offset, size_t n, StringPiece* result, + char* scratch) const override { + EXPECT_FALSE(returned_partial_) << "must not Read() after eof/error"; + + if (force_error_) { + force_error_ = false; + returned_partial_ = true; + return errors::DataLoss("read error"); + } + + if (offset >= contents_.size()) { + return errors::OutOfRange("end of file"); + } + + if (contents_.size() < offset + n) { + n = contents_.size() - offset; + returned_partial_ = true; + } + *result = StringPiece(contents_.data() + offset, n); + return Status::OK(); + } + }; + StringDest dest_; StringSource source_; bool reading_; @@ -109,9 +104,7 @@ class RecordioTest : public ::testing::Test { public: RecordioTest() - : dest_(&contents_), - source_(&contents_), - reading_(false), + : reading_(false), readpos_(0), writer_(new RecordWriter(&dest_)), reader_(new RecordReader(&source_)) {} @@ -126,11 +119,12 @@ class RecordioTest : public ::testing::Test { TF_ASSERT_OK(writer_->WriteRecord(StringPiece(msg))); } - size_t WrittenBytes() const { return contents_.size(); } + 
size_t WrittenBytes() const { return dest_.contents_.size(); } string Read() { if (!reading_) { reading_ = true; + source_.contents_ = StringPiece(dest_.contents_); } string record; Status s = reader_->ReadRecord(&readpos_, &record); @@ -143,20 +137,26 @@ class RecordioTest : public ::testing::Test { } } - void IncrementByte(int offset, int delta) { contents_[offset] += delta; } + void IncrementByte(int offset, int delta) { + dest_.contents_[offset] += delta; + } - void SetByte(int offset, char new_byte) { contents_[offset] = new_byte; } + void SetByte(int offset, char new_byte) { + dest_.contents_[offset] = new_byte; + } - void ShrinkSize(int bytes) { contents_.resize(contents_.size() - bytes); } + void ShrinkSize(int bytes) { + dest_.contents_.resize(dest_.contents_.size() - bytes); + } void FixChecksum(int header_offset, int len) { // Compute crc of type/len/data - uint32_t crc = crc32c::Value(&contents_[header_offset + 6], 1 + len); + uint32_t crc = crc32c::Value(&dest_.contents_[header_offset + 6], 1 + len); crc = crc32c::Mask(crc); - core::EncodeFixed32(&contents_[header_offset], crc); + core::EncodeFixed32(&dest_.contents_[header_offset], crc); } - void ForceError() { source_.force_error(); } + void ForceError() { source_.force_error_ = true; } void StartReadingAt(uint64_t initial_offset) { readpos_ = initial_offset; } @@ -165,6 +165,7 @@ class RecordioTest : public ::testing::Test { Write("bar"); Write(BigString("x", 10000)); reading_ = true; + source_.contents_ = StringPiece(dest_.contents_); uint64 offset = WrittenBytes() + offset_past_end; string record; Status s = reader_->ReadRecord(&offset, &record); @@ -216,100 +217,16 @@ TEST_F(RecordioTest, RandomRead) { ASSERT_EQ("EOF", Read()); } -void TestNonSequentialReads(const RecordWriterOptions& writer_options, - const RecordReaderOptions& reader_options) { - string contents; - StringDest dst(&contents); - RecordWriter writer(&dst, writer_options); - for (int i = 0; i < 10; ++i) { - TF_ASSERT_OK(writer.WriteRecord(NumberString(i))) << i; - } - TF_ASSERT_OK(writer.Close()); - - StringSource file(&contents); - RecordReader reader(&file, reader_options); - - string record; - // First read sequentially to fill in the offsets table. - uint64 offsets[10] = {0}; - uint64 offset = 0; - for (int i = 0; i < 10; ++i) { - offsets[i] = offset; - TF_ASSERT_OK(reader.ReadRecord(&offset, &record)) << i; - } - - // Read randomly: First go back to record #3 then forward to #8. 
- offset = offsets[3]; - TF_ASSERT_OK(reader.ReadRecord(&offset, &record)); - EXPECT_EQ("3.", record); - EXPECT_EQ(offsets[4], offset); - - offset = offsets[8]; - TF_ASSERT_OK(reader.ReadRecord(&offset, &record)); - EXPECT_EQ("8.", record); - EXPECT_EQ(offsets[9], offset); -} - -TEST_F(RecordioTest, NonSequentialReads) { - TestNonSequentialReads(RecordWriterOptions(), RecordReaderOptions()); -} - -TEST_F(RecordioTest, NonSequentialReadsWithReadBuffer) { - RecordReaderOptions options; - options.buffer_size = 1 << 10; - TestNonSequentialReads(RecordWriterOptions(), options); -} - -TEST_F(RecordioTest, NonSequentialReadsWithCompression) { - TestNonSequentialReads( - RecordWriterOptions::CreateRecordWriterOptions("ZLIB"), - RecordReaderOptions::CreateRecordReaderOptions("ZLIB")); -} - // Tests of all the error paths in log_reader.cc follow: -void AssertHasSubstr(StringPiece s, StringPiece expected) { +static void AssertHasSubstr(StringPiece s, StringPiece expected) { EXPECT_TRUE(str_util::StrContains(s, expected)) << s << " does not contain " << expected; } -void TestReadError(const RecordWriterOptions& writer_options, - const RecordReaderOptions& reader_options) { - const string wrote = BigString("well hello there!", 100); - string contents; - StringDest dst(&contents); - TF_ASSERT_OK(RecordWriter(&dst, writer_options).WriteRecord(wrote)); - - StringSource file(&contents); - RecordReader reader(&file, reader_options); - - uint64 offset = 0; - string read; - file.force_error(); - Status status = reader.ReadRecord(&offset, &read); - ASSERT_TRUE(errors::IsDataLoss(status)); - ASSERT_EQ(0, offset); - - // A failed Read() shouldn't update the offset, and thus a retry shouldn't - // lose the record. - status = reader.ReadRecord(&offset, &read); - ASSERT_TRUE(status.ok()) << status; - EXPECT_GT(offset, 0); - EXPECT_EQ(wrote, read); -} - TEST_F(RecordioTest, ReadError) { - TestReadError(RecordWriterOptions(), RecordReaderOptions()); -} - -TEST_F(RecordioTest, ReadErrorWithBuffering) { - RecordReaderOptions options; - options.buffer_size = 1 << 20; - TestReadError(RecordWriterOptions(), options); -} - -TEST_F(RecordioTest, ReadErrorWithCompression) { - TestReadError(RecordWriterOptions::CreateRecordWriterOptions("ZLIB"), - RecordReaderOptions::CreateRecordReaderOptions("ZLIB")); + Write("foo"); + ForceError(); + AssertHasSubstr(Read(), "Data loss"); } TEST_F(RecordioTest, CorruptLength) { @@ -340,6 +257,5 @@ TEST_F(RecordioTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); } TEST_F(RecordioTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); } -} // namespace } // namespace io } // namespace tensorflow diff --git a/tensorflow/core/lib/io/zlib_inputstream.cc b/tensorflow/core/lib/io/zlib_inputstream.cc index bf8dcf0988c..984fbc2810c 100644 --- a/tensorflow/core/lib/io/zlib_inputstream.cc +++ b/tensorflow/core/lib/io/zlib_inputstream.cc @@ -25,9 +25,8 @@ ZlibInputStream::ZlibInputStream( InputStreamInterface* input_stream, size_t input_buffer_bytes, // size of z_stream.next_in buffer size_t output_buffer_bytes, // size of z_stream.next_out buffer - const ZlibCompressionOptions& zlib_options, bool owns_input_stream) - : owns_input_stream_(owns_input_stream), - input_stream_(input_stream), + const ZlibCompressionOptions& zlib_options) + : input_stream_(input_stream), input_buffer_capacity_(input_buffer_bytes), output_buffer_capacity_(output_buffer_bytes), z_stream_input_(new Bytef[input_buffer_capacity_]), @@ -42,14 +41,10 @@ ZlibInputStream::~ZlibInputStream() { if (z_stream_) { 
inflateEnd(z_stream_.get()); } - if (owns_input_stream_) { - delete input_stream_; - } } Status ZlibInputStream::Reset() { TF_RETURN_IF_ERROR(input_stream_->Reset()); - inflateEnd(z_stream_.get()); InitZlibBuffer(); bytes_read_ = 0; return Status::OK(); diff --git a/tensorflow/core/lib/io/zlib_inputstream.h b/tensorflow/core/lib/io/zlib_inputstream.h index 6099e2455d4..9c7e14441ce 100644 --- a/tensorflow/core/lib/io/zlib_inputstream.h +++ b/tensorflow/core/lib/io/zlib_inputstream.h @@ -40,13 +40,10 @@ class ZlibInputStream : public InputStreamInterface { // Create a ZlibInputStream for `input_stream` with a buffer of size // `input_buffer_bytes` bytes for reading contents from `input_stream` and // another buffer with size `output_buffer_bytes` for caching decompressed - // contents. - // - // Takes ownership of `input_stream` iff `owns_input_stream` is true. + // contents. Does *not* take ownership of "input_stream". ZlibInputStream(InputStreamInterface* input_stream, size_t input_buffer_bytes, size_t output_buffer_bytes, - const ZlibCompressionOptions& zlib_options, - bool owns_input_stream = false); + const ZlibCompressionOptions& zlib_options); ~ZlibInputStream(); @@ -68,8 +65,7 @@ class ZlibInputStream : public InputStreamInterface { private: void InitZlibBuffer(); - const bool owns_input_stream_; - InputStreamInterface* input_stream_; + InputStreamInterface* input_stream_; // Not owned size_t input_buffer_capacity_; // Size of z_stream_input_ size_t output_buffer_capacity_; // Size of z_stream_output_ char* next_unread_byte_; // Next unread byte in z_stream_output_ From b7cca088e90b4c2a28c1038980aa09240584e382 Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Thu, 19 Apr 2018 18:12:57 -0700 Subject: [PATCH 0482/1734] Respect any device filters in {Create,Delete}WorkerSessions(). This is another step towards enabling us to turn on explicit worker sessions for all master sessions. 
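For intuition, a device filter is a device-name prefix such as "/job:ps" or
"/job:worker/task:1", and a remote worker task is kept only when its name is
compatible with at least one filter. A minimal sketch of that selection,
assuming a hypothetical FilterWorkers helper rather than the actual
DeviceFinder::GetRemoteWorkers logic:

  #include <algorithm>
  #include <string>
  #include <vector>

  // Sketch only: keep workers whose task name and some filter agree on their
  // common prefix, so "/job:ps" admits every ps task and "/job:worker/task:1"
  // admits exactly one task. An empty filter list keeps everyone.
  std::vector<std::string> FilterWorkers(
      const std::vector<std::string>& workers,
      const std::vector<std::string>& device_filters) {
    if (device_filters.empty()) return workers;
    std::vector<std::string> kept;
    for (const std::string& worker : workers) {
      for (const std::string& filter : device_filters) {
        const size_t n = std::min(worker.size(), filter.size());
        if (worker.compare(0, n, filter, 0, n) == 0) {
          kept.push_back(worker);
          break;
        }
      }
    }
    return kept;
  }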
PiperOrigin-RevId: 193605565 --- tensorflow/core/distributed_runtime/master.cc | 6 +++++- tensorflow/core/distributed_runtime/master_env.h | 3 ++- tensorflow/core/distributed_runtime/master_session.cc | 9 +++++---- tensorflow/core/distributed_runtime/master_session.h | 6 +++++- .../core/distributed_runtime/rpc/grpc_server_lib.cc | 4 +++- 5 files changed, 20 insertions(+), 8 deletions(-) diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc index f47502e844f..288656e7f80 100644 --- a/tensorflow/core/distributed_runtime/master.cc +++ b/tensorflow/core/distributed_runtime/master.cc @@ -417,9 +417,13 @@ void Master::CreateSession(const CreateSessionRequest* req, SessionOptions options; options.config = req->config(); + std::vector filtered_worker_list; + DeviceFinder::GetRemoteWorkers(req->config().device_filters(), env_, + worker_cache, &filtered_worker_list); + MasterSession* session = env_->master_session_factory( options, env_, std::move(remote_devices), std::move(worker_cache_ptr), - std::move(device_set)); + std::move(device_set), std::move(filtered_worker_list)); GraphDef* gdef = const_cast(req)->mutable_graph_def(); diff --git a/tensorflow/core/distributed_runtime/master_env.h b/tensorflow/core/distributed_runtime/master_env.h index 178c5b40ee1..16f4d93c8b4 100644 --- a/tensorflow/core/distributed_runtime/master_env.h +++ b/tensorflow/core/distributed_runtime/master_env.h @@ -83,7 +83,8 @@ struct MasterEnv { SessionOptions, MasterEnv*, std::unique_ptr>>, std::unique_ptr, - std::unique_ptr device_set)> + std::unique_ptr device_set, + std::vector filtered_worker_list)> master_session_factory; std::functionReleaseWorker(part.name, part.worker); + part.worker = nullptr; } return s; } @@ -1119,6 +1120,7 @@ MasterSession::MasterSession( std::unique_ptr>> remote_devs, std::unique_ptr worker_cache, std::unique_ptr device_set, + std::vector filtered_worker_list, StatsPublisherFactory stats_publisher_factory) : session_opts_(opt), env_(env), @@ -1126,6 +1128,7 @@ MasterSession::MasterSession( remote_devs_(std::move(remote_devs)), worker_cache_(std::move(worker_cache)), devices_(std::move(device_set)), + filtered_worker_list_(std::move(filtered_worker_list)), stats_publisher_factory_(std::move(stats_publisher_factory)), graph_version_(0), run_graphs_(5), @@ -1183,9 +1186,8 @@ Status MasterSession::Create(GraphDef* graph_def, Status MasterSession::CreateWorkerSessions( const WorkerCacheFactoryOptions& options) { - std::vector worker_names; + const std::vector worker_names = filtered_worker_list_; WorkerCacheInterface* worker_cache = get_worker_cache(); - worker_cache->ListWorkers(&worker_names); struct WorkerGroup { // The worker name. (Not owned.) @@ -1263,8 +1265,7 @@ Status MasterSession::CreateWorkerSessions( Status MasterSession::DeleteWorkerSessions() { WorkerCacheInterface* worker_cache = get_worker_cache(); - std::vector worker_names; - worker_cache->ListWorkers(&worker_names); + const std::vector& worker_names = filtered_worker_list_; struct WorkerGroup { // The worker name. (Not owned.) 
diff --git a/tensorflow/core/distributed_runtime/master_session.h b/tensorflow/core/distributed_runtime/master_session.h index a05419904f5..ec34e20b79a 100644 --- a/tensorflow/core/distributed_runtime/master_session.h +++ b/tensorflow/core/distributed_runtime/master_session.h @@ -52,6 +52,7 @@ class MasterSession : public core::RefCounted { std::unique_ptr>> remote_devs, std::unique_ptr worker_cache, std::unique_ptr device_set, + std::vector filtered_worker_list, StatsPublisherFactory stats_publisher_factory); // Initialize the MasterSession for "def". Must be called before Extend(), @@ -130,6 +131,10 @@ class MasterSession : public core::RefCounted { // The device set used by this session. std::unique_ptr devices_; + // The (partial device) names of remote worker tasks that this + // session will contact. + const std::vector filtered_worker_list_; + StatsPublisherFactory stats_publisher_factory_; std::atomic_ulong last_access_time_usec_; @@ -212,7 +217,6 @@ class MasterSession : public core::RefCounted { // workers. Status CreateWorkerSessions(const WorkerCacheFactoryOptions& server_def); - // TODO(b/36574172): Always use Create/DeleteWorkerSession. bool should_delete_worker_sessions_ = false; Status DeleteWorkerSessions(); diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc index be191035821..488dcde9f5d 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc @@ -222,10 +222,12 @@ Status GrpcServer::Init( SessionOptions options, const MasterEnv* env, std::unique_ptr>> remote_devs, std::unique_ptr worker_cache, - std::unique_ptr device_set) { + std::unique_ptr device_set, + std::vector filtered_worker_list) { options.config.MergeFrom(config); return new MasterSession(options, env, std::move(remote_devs), std::move(worker_cache), std::move(device_set), + std::move(filtered_worker_list), stats_factory); }; master_env_.worker_cache_factory = From 4f8768319cfa56c25973cc66d920146ad454bd97 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 19 Apr 2018 18:17:02 -0700 Subject: [PATCH 0483/1734] Optimize Graph function library. 
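Two pieces work together here: the function optimizer trims the function
library down to the functions reachable from the optimized graph, and the
meta optimizer now also runs its passes over every fully instantiated
function body, not just the main graph. The trimming is a worklist
reachability walk over function names; a self-contained sketch of the same
idea with simplified types (an illustration, not the TrimFunctionLibrary
code itself):

  #include <string>
  #include <unordered_map>
  #include <unordered_set>
  #include <utility>
  #include <vector>

  // Simplified model of the library: each function name maps to the names of
  // the functions its body calls (including gradient functions).
  using CallGraph = std::unordered_map<std::string, std::vector<std::string>>;

  // Returns every function reachable from the roots (the calls that appear in
  // the main graph); everything else can be dropped from the library.
  std::unordered_set<std::string> ReachableFunctions(
      const CallGraph& call_graph, std::vector<std::string> worklist) {
    std::unordered_set<std::string> keep;
    while (!worklist.empty()) {
      std::string name = std::move(worklist.back());
      worklist.pop_back();
      if (!keep.insert(name).second) continue;  // already visited
      auto it = call_graph.find(name);
      if (it == call_graph.end()) continue;  // not a library function
      for (const std::string& callee : it->second) {
        if (keep.count(callee) == 0) worklist.push_back(callee);
      }
    }
    return keep;
  }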
PiperOrigin-RevId: 193605910 --- tensorflow/core/grappler/optimizers/BUILD | 4 + .../grappler/optimizers/function_optimizer.cc | 126 ++++++- .../grappler/optimizers/function_optimizer.h | 6 +- .../optimizers/function_optimizer_test.cc | 32 +- .../grappler/optimizers/meta_optimizer.cc | 350 +++++++++++------- .../core/grappler/optimizers/meta_optimizer.h | 33 +- .../optimizers/meta_optimizer_test.cc | 172 ++++++++- tensorflow/core/grappler/utils/functions.cc | 12 +- tensorflow/core/grappler/utils/functions.h | 40 +- .../core/grappler/utils/functions_test.cc | 8 +- 10 files changed, 575 insertions(+), 208 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index a371186fe64..3ab8d8f584c 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -518,11 +518,13 @@ cc_library( ":loop_optimizer", ":memory_optimizer", ":model_pruner", + "//tensorflow/core:core_cpu_base", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler/utils:colocation", + "//tensorflow/core/grappler/utils:functions", "//tensorflow/core/grappler/utils:topological_sort", ], ) @@ -539,9 +541,11 @@ tf_cuda_cc_test( "//tensorflow/core:tensorflow", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core:testlib", "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler:utils", "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder", + "//tensorflow/core/grappler/utils:grappler_test", ], ) diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc index d008a9719fe..950933b9335 100644 --- a/tensorflow/core/grappler/optimizers/function_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/graph_def_util.h" #include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op_def.pb.h" #include "tensorflow/core/framework/versions.pb.h" #include "tensorflow/core/graph/graph_constructor.h" @@ -75,12 +76,10 @@ string UniqueSpecializedFunctionName(const FunctionDef& func, class FunctionOptimizerContext { public: - explicit FunctionOptimizerContext(const GrapplerItem& item, - RewriterConfig::Toggle opt_level) - : opt_level_(opt_level), - function_library_(FunctionLibraryDefinition(OpRegistry::Global(), - item.graph.library())) { - InitializeInlinedFunctions(item); + explicit FunctionOptimizerContext(RewriterConfig::Toggle opt_level, + const GrapplerItem& item) + : function_library_(OpRegistry::Global(), item.graph.library()) { + InitializeInlinedFunctions(opt_level, item); } const FunctionLibraryDefinition& function_library() const { @@ -101,8 +100,9 @@ class FunctionOptimizerContext { } private: - void InitializeInlinedFunctions(const GrapplerItem& item) { - bool aggressive = opt_level_ == RewriterConfig::AGGRESSIVE; + void InitializeInlinedFunctions(RewriterConfig::Toggle opt_level, + const GrapplerItem& item) { + bool aggressive = opt_level == RewriterConfig::AGGRESSIVE; for (const FunctionDef& func : item.graph.library().function()) { // Can't create IdentityN nodes with no input or output: skip these @@ -120,7 +120,6 @@ class FunctionOptimizerContext { } } - RewriterConfig::Toggle opt_level_; FunctionLibraryDefinition function_library_; // Functions that can be inlined into optimized graph. std::unordered_map inlined_functions_; @@ -128,9 +127,93 @@ class FunctionOptimizerContext { TF_DISALLOW_COPY_AND_ASSIGN(FunctionOptimizerContext); }; +// Return trimmed FunctionDefLibrary with functions that are reachable from +// the optimized graph. +FunctionDefLibrary TrimFunctionLibrary(const FunctionLibraryDefinition& flib, + const GraphDef& optimized_graph) { + // Functions that are reachable from the optimized graph. + std::unordered_set keep_funcs; + + std::vector func_queue; + func_queue.reserve(flib.num_functions()); + + // Add registered and not already processed functions to the queue by name. + const auto add_to_func_queue = [&](const string& func_name) { + const FunctionDef* func = flib.Find(func_name); + if (func && keep_funcs.find(func_name) == keep_funcs.end()) { + func_queue.push_back(func); + } + }; + + // Find all the functions that are reachable from the given node. + const auto add_node_to_func_queue = [&](const NodeDef& node) { + // Node itself can be a call to the function. + add_to_func_queue(node.op()); + + // Or node can have an attribute referencing a function. + for (const auto& attr : node.attr()) { + const auto& attr_value = attr.second; + + // 1. AttrValue.func + if (attr_value.has_func()) { + add_to_func_queue(attr_value.func().name()); + } + + // 2. AttrValue.ListValue.func + if (attr_value.has_list()) { + for (const auto& func : attr_value.list().func()) { + add_to_func_queue(func.name()); + } + } + } + }; + + // Add all functions that are directly called from the optimized graph. + const auto& graph_nodes = optimized_graph.node(); + std::for_each(graph_nodes.begin(), graph_nodes.end(), add_node_to_func_queue); + + // Process all reachable functions. 
+ while (!func_queue.empty()) { + const FunctionDef* func = func_queue.back(); + func_queue.pop_back(); + + const string& func_name = func->signature().name(); + keep_funcs.insert(func_name); + + // Find all the functions that called from the function body. + const auto& func_body = func->node_def(); + std::for_each(func_body.begin(), func_body.end(), add_node_to_func_queue); + + // Check if the function has a registered gradient. + const string grad_func_name = flib.FindGradient(func_name); + if (!grad_func_name.empty()) add_to_func_queue(grad_func_name); + } + + FunctionDefLibrary lib; + for (const string& func_name : keep_funcs) { + const FunctionDef* func = CHECK_NOTNULL(flib.Find(func_name)); + *lib.add_function() = *func; + + const string grad_func_name = flib.FindGradient(func_name); + if (!grad_func_name.empty()) { + GradientDef* gd = lib.add_gradient(); + gd->set_function_name(func_name); + gd->set_gradient_func(grad_func_name); + } + } + + VLOG(3) << "Trimmed function library: " << keep_funcs.size() << " functions (" + << static_cast(keep_funcs.size() - flib.num_functions()) << ")"; + + return lib; +} + Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func, FunctionOptimizerContext* ctx, GraphDef* optimized_graph) { + VLOG(2) << "Specialize function instantiation: " + << SummarizeNodeDef(func_node); + const std::unordered_map func_attr( func_node.attr().begin(), func_node.attr().end()); @@ -141,20 +224,20 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func, TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, func_attr, flib, &item)); // TODO(ezhulenev): Push down const inputs and known input shapes. - FunctionDef specialized; - TF_RETURN_IF_ERROR(MakeSpecializedFunctionDef(item, flib, &specialized)); + FunctionDef specialized_func; + TF_RETURN_IF_ERROR(MakeFunctionDef(item, flib, &specialized_func)); // Find a name for specialized function. const string specialized_func_name = UniqueSpecializedFunctionName(func, func_node, flib); - specialized.mutable_signature()->set_name(specialized_func_name); - auto* specialized_attr = specialized.mutable_attr(); + specialized_func.mutable_signature()->set_name(specialized_func_name); + auto* specialized_attr = specialized_func.mutable_attr(); (*specialized_attr)[kGrapplerSpecializedFuncAttr].set_b(true); // Add specialized function to the library. TF_RETURN_IF_ERROR( - ctx->mutable_function_library().AddFunctionDef(specialized)); + ctx->mutable_function_library().AddFunctionDef(specialized_func)); // Add a function call node for the specialized function. 
NodeDef* specialized_func_node = optimized_graph->add_node(); @@ -226,6 +309,8 @@ Status HookInlinedFunctionOutputs( Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, const FunctionOptimizerContext& ctx, GraphDef* optimized_graph) { + VLOG(2) << "Inline function instantiation: " << SummarizeNodeDef(func_node); + const std::unordered_map func_attr( func_node.attr().begin(), func_node.attr().end()); @@ -359,6 +444,8 @@ class SymbolicGradientEnv { Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env, GraphDef* inlined_graph) { + VLOG(2) << "Inline symbolic gradient: " << SummarizeNodeDef(node); + GraphDef graph_def; // Create a node to anchor the gradient inputs @@ -454,13 +541,16 @@ Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env, Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, GraphDef* optimized_graph) { + VLOG(2) << "Optimize function library: id=" << item.id; + // Nothing to do here. if (item.graph.library().function_size() == 0) { + VLOG(3) << "Skip Grappler item with empty function library"; *optimized_graph = item.graph; return Status::OK(); } - FunctionOptimizerContext ctx(item, opt_level_); + FunctionOptimizerContext ctx(opt_level_, item); SymbolicGradientEnv env(item.graph.versions().producer(), item.graph.library()); @@ -506,9 +596,11 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, *optimized_graph->add_node() = node; } - // TODO(bsteiner): trim the library to remove unused function definitions *optimized_graph->mutable_versions() = item.graph.versions(); - *optimized_graph->mutable_library() = ctx.function_library().ToProto(); + *optimized_graph->mutable_library() = + options_.enable_trim_function_library + ? TrimFunctionLibrary(ctx.function_library(), *optimized_graph) + : ctx.function_library().ToProto(); return Status::OK(); } diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.h b/tensorflow/core/grappler/optimizers/function_optimizer.h index c555fadf83a..e307b4e533f 100644 --- a/tensorflow/core/grappler/optimizers/function_optimizer.h +++ b/tensorflow/core/grappler/optimizers/function_optimizer.h @@ -26,8 +26,9 @@ namespace grappler { // operations to make the overall graph more efficient. 
class FunctionOptimizer : public GraphOptimizer { public: - FunctionOptimizer(RewriterConfig::Toggle opt_level) : opt_level_(opt_level) {} - ~FunctionOptimizer() override {} + explicit FunctionOptimizer(RewriterConfig::Toggle opt_level) + : opt_level_(opt_level) {} + ~FunctionOptimizer() override = default; string name() const override { return "function_optimizer"; }; @@ -44,6 +45,7 @@ class FunctionOptimizer : public GraphOptimizer { bool enable_function_inlining = true; bool enable_function_specialization = true; bool enable_symbolic_gradient_inlining = true; + bool enable_trim_function_library = true; }; RewriterConfig::Toggle opt_level_; diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc index fb006d48688..6147e8a27c0 100644 --- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc +++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc @@ -31,20 +31,8 @@ constexpr char kDevice[] = "/device:CPU:0"; class FunctionOptimizerTest : public GrapplerTest { protected: - void DisableAll(FunctionOptimizer* optimizer) { - optimizer->options_.enable_function_inlining = false; + void DisableFunctionSpecialization(FunctionOptimizer* optimizer) { optimizer->options_.enable_function_specialization = false; - optimizer->options_.enable_symbolic_gradient_inlining = false; - } - - void EnableOnlyFunctionInlining(FunctionOptimizer* optimizer) { - DisableAll(optimizer); - optimizer->options_.enable_function_inlining = true; - } - - void EnableOnlyFunctionSpecialization(FunctionOptimizer* optimizer) { - DisableAll(optimizer); - optimizer->options_.enable_function_specialization = true; } }; @@ -352,7 +340,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithoutInput) { using test::function::NDef; FunctionOptimizer optimizer(RewriterConfig::DEFAULT); - EnableOnlyFunctionInlining(&optimizer); + DisableFunctionSpecialization(&optimizer); // do not specialize noinline func const Tensor kTwo = test::AsScalar(2); FunctionDef func = FunctionDefHelper::Define( @@ -626,14 +614,13 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) { using test::function::NDef; FunctionOptimizer optimizer(RewriterConfig::DEFAULT); - EnableOnlyFunctionSpecialization(&optimizer); - // Mark XTimesTwo as noinline + // Mark XTimesTwo as noinline. FunctionDef x_times_two = test::function::XTimesTwo(); (*x_times_two.mutable_attr())["_noinline"].set_b(true); std::vector function_library = {x_times_two}; - // Build a graph to compute y = XTimesTwo(x) + // Build a graph to compute y = XTimesTwo(x). GrapplerItem item; item.graph = test::function::GDef( {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice), @@ -644,12 +631,13 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) { GraphDef output; TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output)); - // Make sure that specialized function was added to the library - EXPECT_EQ(2, output.library().function_size()); + // Make sure that specialized function was added to the library and original + // function was removed. + EXPECT_EQ(1, output.library().function_size()); EXPECT_EQ("XTimesTwo_specialized_for_y", - output.library().function(1).signature().name()); + output.library().function(0).signature().name()); - // And 'y' node is calling specialized function + // And 'y' node is calling specialized function. 
int count = 0; for (const NodeDef& node : output.node()) { if (node.name() == "y" && count++) { @@ -658,7 +646,7 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) { } EXPECT_EQ(1, count); - // And that graph evaluation yields the same result + // And that graph evaluation yields the same result. Tensor pi = test::AsScalar(3.14f); item.fetch = {"z"}; item.feed.emplace_back("x", pi); diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index 558b8a77e8a..22799311bcd 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/grappler/optimizers/meta_optimizer.h" +#include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/versions.pb.h" #include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h" @@ -29,6 +30,7 @@ limitations under the License. #include "tensorflow/core/grappler/optimizers/memory_optimizer.h" #include "tensorflow/core/grappler/optimizers/model_pruner.h" #include "tensorflow/core/grappler/utils/colocation.h" +#include "tensorflow/core/grappler/utils/functions.h" #include "tensorflow/core/grappler/utils/topological_sort.h" #include "tensorflow/core/lib/core/status.h" @@ -36,6 +38,9 @@ namespace tensorflow { namespace grappler { namespace { + +constexpr int kDefaultNumberOfIterations = 1; + int64 NumEdges(const GraphDef& graph) { int64 num_edges = 0; for (const auto& node : graph.node()) { @@ -50,144 +55,138 @@ string PrintSizesBeforeAfter(const GraphDef& before, const GraphDef& after) { NumEdges(after), " edges (", NumEdges(after) - NumEdges(before), ")"); } -} // namespace -std::unique_ptr MetaOptimizer::NewOptimizer( - const string& optimizer) { - std::unique_ptr graph_optimizer; - if (optimizer == "pruning") { - graph_optimizer.reset(new ModelPruner()); - } - if (optimizer == "function") { - graph_optimizer.reset(new FunctionOptimizer(cfg_.function_optimization())); - } - if (optimizer == "constfold") { - graph_optimizer.reset(new ConstantFolding(cpu_device_)); - } - if (optimizer == "layout") { - graph_optimizer.reset(new LayoutOptimizer()); - } - if (optimizer == "memory") { - graph_optimizer.reset(new MemoryOptimizer(RewriterConfig::MANUAL)); - } - if (optimizer == "arithmetic") { - graph_optimizer.reset( - new ArithmeticOptimizer(cfg_.arithmetic_optimization())); - } - if (optimizer == "autoparallel") { - graph_optimizer.reset( - new AutoParallel(cfg_.auto_parallel().num_replicas())); - } - if (optimizer == "loop") { - graph_optimizer.reset(new LoopOptimizer(cfg_.loop_optimization())); - } - if (optimizer == "dependency") { - graph_optimizer.reset( - new DependencyOptimizer(cfg_.dependency_optimization())); - } - if (optimizer == "debug_stripper") { - graph_optimizer.reset(new DebugStripper()); - } - return graph_optimizer; +int NumIterations(const RewriterConfig& cfg) { + return cfg.meta_optimizer_iterations() == RewriterConfig::DEFAULT_NUM_ITERS + ? 
kDefaultNumberOfIterations + : cfg.meta_optimizer_iterations(); } -Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, - GraphDef* optimized_graph) { - std::vector> optimizers; - if (cfg_.optimizers().empty()) { - if (!cfg_.disable_model_pruning()) { - optimizers.push_back(std::unique_ptr(new ModelPruner())); - } - if (cfg_.function_optimization() != RewriterConfig::OFF) { - optimizers.push_back(std::unique_ptr( - new FunctionOptimizer(cfg_.function_optimization()))); - } - if (cfg_.debug_stripper() == RewriterConfig::ON) { - optimizers.push_back( - std::unique_ptr(new DebugStripper())); - } - if (cfg_.constant_folding() != RewriterConfig::OFF) { - optimizers.push_back(std::unique_ptr( - new ConstantFolding(cfg_.constant_folding(), cpu_device_))); - } - if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) { - optimizers.push_back(std::unique_ptr( - new ArithmeticOptimizer(cfg_.arithmetic_optimization()))); - } - if (cfg_.loop_optimization() != RewriterConfig::OFF) { - optimizers.push_back(std::unique_ptr( - new LoopOptimizer(cfg_.loop_optimization()))); - } - if (cfg_.dependency_optimization() != RewriterConfig::OFF) { - optimizers.push_back(std::unique_ptr( - new DependencyOptimizer(cfg_.dependency_optimization()))); - } - if (cfg_.layout_optimizer() != RewriterConfig::OFF) { - optimizers.push_back( - std::unique_ptr(new LayoutOptimizer())); - } - if (cfg_.memory_optimization() != RewriterConfig::NO_MEM_OPT) { - if (cfg_.memory_optimizer_target_node_name_scope().empty()) { - optimizers.push_back(std::unique_ptr( - // Use the default target node name prefix "gradients/" - new MemoryOptimizer(cfg_.memory_optimization()))); - } else { - optimizers.push_back( - std::unique_ptr(new MemoryOptimizer( - cfg_.memory_optimization(), - cfg_.memory_optimizer_target_node_name_scope()))); - } - } - if (cfg_.auto_parallel().enable()) { - optimizers.push_back(std::unique_ptr( - new AutoParallel(cfg_.auto_parallel().num_replicas()))); - } - } else { - const std::set available_optimizers = { - "pruning", "function", "constfold", "layout", - "memory", "autoparallel", "arithmetic", "loop", - "dependency", "debug_stripper"}; - std::vector custom_optimizer_names; - for (const auto& optimizer_name : cfg_.optimizers()) { - if (available_optimizers.find(optimizer_name) != - available_optimizers.end()) { - optimizers.push_back(NewOptimizer(optimizer_name)); - } else { - custom_optimizer_names.push_back(optimizer_name); - } - } - // Now run the custom optimizers. - for (const auto& optimizer_name : custom_optimizer_names) { - std::unique_ptr opt = - CustomGraphOptimizerRegistry::CreateByNameOrNull(optimizer_name); - if (opt == nullptr) continue; - TF_RETURN_IF_ERROR(opt->Init()); - optimizers.push_back(std::move(opt)); +// Check if optimizer is allowed to run only once. 
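+// (Currently this is only the layout optimizer; the loop below skips such
+// optimizers on every iteration after the first.)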
+int IsRunOnceOptimizer(const string& name) { return name == "layout"; } + +} // namespace + +std::unique_ptr MetaOptimizer::MakeNewOptimizer( + const string& optimizer) const { +#define MK_OPT(NAME, VALUE) \ + if (optimizer == NAME) return std::unique_ptr(VALUE) + + MK_OPT("pruning", new ModelPruner()); + MK_OPT("function", new FunctionOptimizer(cfg_.function_optimization())); + MK_OPT("constfold", new ConstantFolding(cpu_device_)); + MK_OPT("layout", new LayoutOptimizer()); + MK_OPT("memory", new MemoryOptimizer(RewriterConfig::MANUAL)); + MK_OPT("arithmetic", new ArithmeticOptimizer(cfg_.arithmetic_optimization())); + MK_OPT("autoparallel", new AutoParallel(cfg_.auto_parallel().num_replicas())); + MK_OPT("loop", new LoopOptimizer(cfg_.loop_optimization())); + MK_OPT("dependency", new DependencyOptimizer(cfg_.dependency_optimization())); + MK_OPT("debug_stripper", new DebugStripper()); + + return std::unique_ptr(); +#undef MK_OPT +} + +Status MetaOptimizer::InitializeOptimizers( + std::vector>* optimizers) const { + if (!cfg_.disable_model_pruning()) { + optimizers->emplace_back(new ModelPruner()); + } + if (cfg_.function_optimization() != RewriterConfig::OFF) { + optimizers->emplace_back( + new FunctionOptimizer(cfg_.function_optimization())); + } + if (cfg_.debug_stripper() == RewriterConfig::ON) { + optimizers->emplace_back(new DebugStripper()); + } + if (cfg_.constant_folding() != RewriterConfig::OFF) { + optimizers->emplace_back( + new ConstantFolding(cfg_.constant_folding(), cpu_device_)); + } + if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) { + optimizers->emplace_back( + new ArithmeticOptimizer(cfg_.arithmetic_optimization())); + } + if (cfg_.loop_optimization() != RewriterConfig::OFF) { + optimizers->emplace_back(new LoopOptimizer(cfg_.loop_optimization())); + } + if (cfg_.dependency_optimization() != RewriterConfig::OFF) { + optimizers->emplace_back( + new DependencyOptimizer(cfg_.dependency_optimization())); + } + if (cfg_.layout_optimizer() != RewriterConfig::OFF) { + optimizers->emplace_back(new LayoutOptimizer()); + } + if (cfg_.memory_optimization() != RewriterConfig::NO_MEM_OPT) { + if (cfg_.memory_optimizer_target_node_name_scope().empty()) { + optimizers->emplace_back( + // Use the default target node name prefix "gradients/" + new MemoryOptimizer(cfg_.memory_optimization())); + } else { + optimizers->emplace_back( + new MemoryOptimizer(cfg_.memory_optimization(), + cfg_.memory_optimizer_target_node_name_scope())); } } + if (cfg_.auto_parallel().enable()) { + optimizers->emplace_back( + new AutoParallel(cfg_.auto_parallel().num_replicas())); + } + return Status::OK(); +} + +Status MetaOptimizer::InitializeOptimizersByName( + std::vector>* optimizers) const { + for (const string& optimizer_name : cfg_.optimizers()) { + auto optimizer = MakeNewOptimizer(optimizer_name); + if (optimizer) { + VLOG(2) << "Registered default graph optimizer: " << optimizer_name; + optimizers->push_back(std::move(optimizer)); + continue; + } + + auto custom_optimizer = + CustomGraphOptimizerRegistry::CreateByNameOrNull(optimizer_name); + + if (custom_optimizer) { + VLOG(2) << "Registered custom graph optimizer: " << optimizer_name; + TF_RETURN_IF_ERROR(custom_optimizer->Init()); + optimizers->push_back(std::move(custom_optimizer)); + } else { + VLOG(2) << "Can't register an optimizer by name: " << optimizer_name; + } + } + return Status::OK(); +} + +Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) { + VLOG(2) << "Optimize 
GrapplerItem: item.id=" << item.id; + + std::vector> optimizers; + bool register_by_name = !cfg_.optimizers().empty(); + TF_RETURN_IF_ERROR(register_by_name ? InitializeOptimizersByName(&optimizers) + : InitializeOptimizers(&optimizers)); if (optimizers.empty()) { *optimized_graph = item.graph; return Status::OK(); } - // Some optimizers should be run only once. - const std::set run_once_optimizers = {"layout"}; - bool already_optimized = false; - const int num_iterations = - cfg_.meta_optimizer_iterations() == RewriterConfig::DEFAULT_NUM_ITERS - ? 1 - : cfg_.meta_optimizer_iterations(); + // Invariant: optimized_graph contains the most recently optimized version of + // the graph. GrapplerItem optimized_item = item; optimized_graph->Swap(&optimized_item.graph); - for (int iteration = 0; iteration < num_iterations; ++iteration) { - VLOG(1) << "Starting optimization iteration " << iteration + 1; + + GraphOptimizationResult optimization_result(item.id); + + for (int iteration = 0; iteration < NumIterations(cfg_); ++iteration) { + VLOG(4) << "Starting optimization iteration " << iteration + 1; + for (const auto& optimizer : optimizers) { - // Invariant: optimized_graph contains the most recently optimized - // version of the graph. - if (iteration > 0 && run_once_optimizers.count(optimizer->name())) { - continue; - } + // Some optimizers can run only once. + if (iteration > 0 && IsRunOnceOptimizer(optimizer->name())) continue; + uint64 start_us = Env::Default()->NowMicros(); // This swaps the current optimized_graph into optimized item and // resets optimized_graph to an empty graph. @@ -195,45 +194,114 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, *optimized_graph = GraphDef(); Status status = optimizer->Optimize(cluster, optimized_item, optimized_graph); - uint64 end_us = Env::Default()->NowMicros(); - float duration_ms = (end_us - start_us) / 1000.0f; + string result; if (!status.ok()) { - VLOG(1) << "Not able to apply optimizer " << optimizer->name() << ": " - << status.ToString(); optimized_graph->Swap(&optimized_item.graph); result = status.ToString(); } else { - already_optimized = true; + optimization_result.is_optimized = true; + float duration_ms = (end_us - start_us) / 1000.0f; result = strings::StrCat( - optimizer->name(), ": ", PrintSizesBeforeAfter(optimized_item.graph, *optimized_graph), ", time = ", duration_ms, "ms."); } - result_.emplace_back(optimizer->name(), result); - VLOG(1) << result; + VLOG(4) << optimizer->name() << ": " << result; + + OptimizerResult optimizer_result{optimizer->name(), result}; + optimization_result.results.push_back(optimizer_result); } } - if (already_optimized) { + // Record graph optimization result. + optimization_results_.push_back(optimization_result); + + if (optimization_result.is_optimized) { TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph)); ReassignColocation(optimized_graph); - // Make sure that the optimizers preserved the graph version and library. - DCHECK_GE(optimized_graph->library().function_size(), - item.graph.library().function_size()); - DCHECK_GE(optimized_graph->library().gradient_size(), - item.graph.library().gradient_size()); + // Make sure that the optimizers preserved the graph version. DCHECK_EQ(optimized_graph->versions().producer(), item.graph.versions().producer()); } + + return Status::OK(); +} + +Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) { + optimization_results_.clear(); + + // 1. 
Optimize main graph + TF_RETURN_IF_ERROR(OptimizeGraph(cluster, item, optimized_graph)); + + // 2. Optimize function library + FunctionLibraryDefinition flib(OpRegistry::Global(), + optimized_graph->library()); + + // Optimize each function only once. + std::unordered_set optimized_funcs; + bool optimize_function_library = true; + + while (optimize_function_library) { + optimize_function_library = false; + + for (const FunctionDef& func : optimized_graph->library().function()) { + const string& func_name = func.signature().name(); + + // Skip already optimized functions. + if (optimized_funcs.find(func_name) != optimized_funcs.end()) continue; + + // Skip parametrized functions (function type or body is defined only at + // function call time by caller node attributes). + if (IsParametrized(func)) continue; + + VLOG(3) << "Optimize function: function=" << func_name; + + // Function optimization might specialize nested function calls, so we + // have to reset the flag and do at least one more pass over the library. + optimize_function_library = true; + optimized_funcs.insert(func_name); + + // Make a GrapplerItem from a FunctionDef. + GrapplerFunctionItem func_item; + TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, flib, &func_item)); + + // Optimize function body graph. + GraphDef optimized_func_graph; + TF_RETURN_IF_ERROR( + OptimizeGraph(cluster, func_item, &optimized_func_graph)); + + // Function body optimization might have created new specialized + // functions, add them to the library. + TF_RETURN_IF_ERROR(flib.AddLibrary(optimized_func_graph.library())); + + // Convert optimized graph back to FunctionDef. + FunctionDef optimized_func; + func_item.SwapFunctionBody(std::move(optimized_func_graph)); + TF_RETURN_IF_ERROR(MakeFunctionDef(func_item, flib, &optimized_func)); + + // Replace optimized function with a new FunctionDef. + TF_RETURN_IF_ERROR(flib.RemoveFunction(func_name)); + TF_RETURN_IF_ERROR(flib.AddFunctionDef(optimized_func)); + } + + // If optimized at least one function, update the graph library. 
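+    // (Specializing one function body can introduce calls to new, not yet
+    // optimized functions, so the enclosing while-loop makes another pass
+    // over the updated library until a pass changes nothing.)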
+ if (optimize_function_library) { + *optimized_graph->mutable_library() = flib.ToProto(); + } + } + return Status::OK(); } void MetaOptimizer::PrintResult() { - for (const auto& result : result_) { - LOG(INFO) << "Return status of optimizer " << result.first << ": " - << result.second; + for (const GraphOptimizationResult& graph_result : optimization_results_) { + LOG(INFO) << "Optimization results for grappler item: " << graph_result.id; + for (const OptimizerResult& result : graph_result.results) { + LOG(INFO) << "Return status of optimizer " << result.optimizer_name + << ": " << result.result; + } } } diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h index 382cfe51d42..7cf9a40c2d6 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.h +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h @@ -30,7 +30,7 @@ class MetaOptimizer : public GraphOptimizer { public: MetaOptimizer(DeviceBase* cpu_device, const RewriterConfig& cfg) : cpu_device_(cpu_device), cfg_(cfg) {} - ~MetaOptimizer() override {} + ~MetaOptimizer() override = default; string name() const override { return "meta_optimizer"; }; @@ -43,10 +43,37 @@ class MetaOptimizer : public GraphOptimizer { const GraphDef& optimized_graph, double result) override; private: - std::unique_ptr NewOptimizer(const string& optimizer); + std::unique_ptr MakeNewOptimizer( + const string& optimizer) const; + + // Initialize active optimizers from RewriterConfig toggles. + Status InitializeOptimizers( + std::vector>* optimizers) const; + // Initialize active optimizers from RewriterConfig optimizer names. + Status InitializeOptimizersByName( + std::vector>* optimizers) const; + + // Run optimization pass over a single GrapplerItem. Meta optimizer might run + // multiple such passes: 1) for the main graph 2) for the function library + Status OptimizeGraph(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph); + DeviceBase* const cpu_device_; // may be NULL RewriterConfig cfg_; - std::vector> result_; + + struct OptimizerResult { + string optimizer_name; + string result; + }; + + struct GraphOptimizationResult { + explicit GraphOptimizationResult(const string& id) : id(id) {} + string id; + bool is_optimized = false; + std::vector results; + }; + + std::vector optimization_results_; }; bool MetaOptimizerEnabled(const RewriterConfig& cfg); diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc index d9a386b9be2..8793ad9633c 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc @@ -16,11 +16,14 @@ limitations under the License. 
#include "tensorflow/core/grappler/optimizers/meta_optimizer.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/framework/function_testlib.h" +#include "tensorflow/core/framework/tensor_testutil.h" #include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h" #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h" #include "tensorflow/core/grappler/utils.h" +#include "tensorflow/core/grappler/utils/grappler_test.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" @@ -28,6 +31,8 @@ namespace tensorflow { namespace grappler { namespace { +constexpr char kDevice[] = "/device:CPU:0"; + class TestOptimizer : public CustomGraphOptimizer { public: static void SetOptimized(const bool flag_value) { optimized_ = flag_value; } @@ -56,7 +61,9 @@ bool TestOptimizer::optimized_; REGISTER_GRAPH_OPTIMIZER(TestOptimizer); -TEST(MetaOptimizerTest, RunsCustomOptimizer) { +class MetaOptimizerTest : public GrapplerTest {}; + +TEST_F(MetaOptimizerTest, RunsCustomOptimizer) { TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"}); GrapplerItem item; CHECK(fake_input.NextItem(&item)); @@ -72,7 +79,7 @@ TEST(MetaOptimizerTest, RunsCustomOptimizer) { EXPECT_TRUE(TestOptimizer::IsOptimized()); } -TEST(MetaOptimizerTest, RunOptimizersTwice) { +TEST_F(MetaOptimizerTest, RunOptimizersTwice) { TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"}); GrapplerItem item; CHECK(fake_input.NextItem(&item)); @@ -86,6 +93,167 @@ TEST(MetaOptimizerTest, RunOptimizersTwice) { TF_EXPECT_OK(status); } +TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) { + using test::function::NDef; + + // Enable ony function optimization. + RewriterConfig rewriter_config; + rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO); + rewriter_config.set_function_optimization(RewriterConfig::ON); + rewriter_config.add_optimizers("function"); + + MetaOptimizer optimizer(nullptr, rewriter_config); + + // Define function library: + // + // MyMul(x, y) = x * y + // *MySquare(x) = MyMul(x, x) + // *MyQuadratic(x) = MySquare(MySquare(x)) + // + // * - marked as noinline + + FunctionDef mul_func = FunctionDefHelper::Create( + "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"}, + {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}}, + /* Mapping between function returns and function node outputs. */ + {{"z", "mul:z:0"}}); + + FunctionDef square_func = FunctionDefHelper::Create( + "MySquare", {"x:T"}, {"z:T"}, {"T: {float, double}"}, + {{{"my_mul"}, "MyMul", {"x", "x"}, {{"T", "$T"}}}}, + /* Mapping between function returns and function node outputs. */ + {{"z", "my_mul:z:0"}}); + (*square_func.mutable_attr())["_noinline"].set_b(true); + + FunctionDef quadratic_func = FunctionDefHelper::Create( + "MyQuadratic", {"x:T"}, {"z:T"}, {"T: {float, double}"}, + {{{"square"}, "MySquare", {"x"}, {{"T", "$T"}}}, + {{"quadratic"}, "MySquare", {"square:z"}, {{"T", "$T"}}}}, + /* Mapping between function returns and function node outputs. 
*/ + {{"z", "quadratic:z:0"}}); + (*quadratic_func.mutable_attr())["_noinline"].set_b(true); + + // Tensorflow graph: + // + // a = tf.Placeholder(tf.float); + // b = tf.Placeholder(tf.int32); + // + // square = MySquare(a); // a^2 + // quadratic = MyQuadratic(b); // b^4 + GrapplerItem item; + item.graph = test::function::GDef( + {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice), + NDef("b", "Placeholder", {}, {{"dtype", DT_INT32}}, kDevice), + // Calls into function library + NDef("square", "MySquare", {"a"}, {{"T", DT_FLOAT}}, kDevice), + NDef("quadratic", "MyQuadratic", {"b"}, {{"T", DT_INT32}}, kDevice), + // Forward outputs + NDef("out_s", "Identity", {"square:0"}, {{"T", DT_FLOAT}}, kDevice), + NDef("out_q", "Identity", {"quadratic:0"}, {{"T", DT_INT32}}, kDevice)}, + // FunctionLib + {mul_func, square_func, quadratic_func}); + + GraphDef output; + TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output)); + + FunctionLibraryDefinition optimized_flib(OpRegistry::Global(), + output.library()); + + // Specialized and optimized functions should be added to the graph. + EXPECT_EQ(6, optimized_flib.num_functions()); + + // MyQuadratic should be specialized once: + // 0. 'quadratic' node in the main graph + const string optimized_0 = "MyQuadratic_specialized_for_quadratic"; + + // MySquare should be specialized and optimized for 3 instantiations: + // 1. 'square' node in the main graph + // 2. 'square' node in the MyQuadratic specialization + // 3. 'quadratic' node in the MyQuadratic specialization + + const string optimized_1 = "MySquare_specialized_for_square"; + const string optimized_2 = "MySquare_specialized_for_square_1"; + const string optimized_3 = "MySquare_specialized_for_quadratic"; + + const FunctionDef* optimized_func_0 = optimized_flib.Find(optimized_0); + const FunctionDef* optimized_func_1 = optimized_flib.Find(optimized_1); + const FunctionDef* optimized_func_2 = optimized_flib.Find(optimized_2); + const FunctionDef* optimized_func_3 = optimized_flib.Find(optimized_3); + + ASSERT_NE(optimized_func_0, nullptr); + ASSERT_NE(optimized_func_1, nullptr); + ASSERT_NE(optimized_func_2, nullptr); + ASSERT_NE(optimized_func_3, nullptr); + + // Graph should call optimized function. + int count = 0; + for (const NodeDef& node : output.node()) { + if (node.name() == "square" && count++) { + EXPECT_EQ("MySquare_specialized_for_square", node.op()); + } else if (node.name() == "quadratic" && count++) { + EXPECT_EQ("MyQuadratic_specialized_for_quadratic", node.op()); + } + } + EXPECT_EQ(2, count); + + // Specialized MySquare should call specialized functions. + count = 0; + for (const NodeDef& node : optimized_func_0->node_def()) { + if (node.name() == "square" && count++) { + EXPECT_EQ(optimized_2, node.op()); + } else if (node.name() == "quadratic" && count++) { + EXPECT_EQ(optimized_3, node.op()); + } + } + EXPECT_EQ(2, count); + + const std::vector optimized_funcs = { + optimized_func_1, optimized_func_1, optimized_func_3}; + + // MyMul should be inlined into all optimized versions of MySquare. 
+ for (const FunctionDef* optimized_func : optimized_funcs) { + count = 0; + for (const NodeDef& node : optimized_func->node_def()) { + if (node.name() == "my_mul/inlined_inputs" && count++) { + EXPECT_EQ("IdentityN", node.op()); + EXPECT_EQ(2, node.input_size()); + EXPECT_EQ("x:0", node.input(0)); + EXPECT_EQ("x:0", node.input(1)); + } else if (node.name() == "my_mul/x" && count++) { + EXPECT_EQ("Identity", node.op()); + EXPECT_EQ(1, node.input_size()); + EXPECT_EQ("my_mul/inlined_inputs:output:0", node.input(0)); + } else if (node.name() == "my_mul/y" && count++) { + EXPECT_EQ("Identity", node.op()); + EXPECT_EQ(1, node.input_size()); + EXPECT_EQ("my_mul/inlined_inputs:output:1", node.input(0)); + } else if (node.name() == "my_mul/mul" && count++) { + EXPECT_EQ("Mul", node.op()); + EXPECT_EQ(2, node.input_size()); + EXPECT_EQ("my_mul/x:output:0", node.input(0)); + EXPECT_EQ("my_mul/y:output:0", node.input(1)); + } else if (node.name() == "my_mul" && count++) { + EXPECT_EQ("IdentityN", node.op()); + EXPECT_EQ(1, node.input_size()); + EXPECT_EQ("my_mul/mul:z:0", node.input(0)); + } + EXPECT_TRUE(node.device().empty()); + } + EXPECT_EQ(5, count); + } + + item.fetch = {"out_s", "out_q"}; + item.feed.emplace_back("a", test::AsScalar(2.0f)); + item.feed.emplace_back("b", test::AsScalar(4)); + auto tensors_expected = EvaluateFetchNodes(item); + + GrapplerItem optimized(item, std::move(output)); + auto tensors = EvaluateFetchNodes(optimized); + + test::ExpectTensorEqual(tensors_expected[0], tensors[0]); + test::ExpectTensorEqual(tensors_expected[1], tensors[1]); +} + } // namespace } // namespace grappler } // namespace tensorflow diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc index 638fe1999a6..790809bc670 100644 --- a/tensorflow/core/grappler/utils/functions.cc +++ b/tensorflow/core/grappler/utils/functions.cc @@ -545,6 +545,12 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func, return Status::OK(); } +Status MakeGrapplerFunctionItem(const FunctionDef& func, + const FunctionLibraryDefinition& flib, + GrapplerFunctionItem* item) { + return MakeGrapplerFunctionItem(func, AttrValueMap(), flib, item); +} + // Register GrapplerFunctionItem input arg expansion and function body outputs // in the GrapplerFunctionConnectivity. Status RegisterGrapplerFunctionConnectivity( @@ -560,9 +566,9 @@ Status RegisterGrapplerFunctionConnectivity( return Status::OK(); } -Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item, - const FunctionLibraryDefinition& flib, - FunctionDef* func) { +Status MakeFunctionDef(const GrapplerFunctionItem& item, + const FunctionLibraryDefinition& flib, + FunctionDef* func) { func->mutable_signature()->set_name(item.id); func->mutable_signature()->set_is_stateful(item.is_stateful()); diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h index ab369bcad7c..5e8b6c69601 100644 --- a/tensorflow/core/grappler/utils/functions.h +++ b/tensorflow/core/grappler/utils/functions.h @@ -38,7 +38,8 @@ using AttrValueMap = std::unordered_map; // function body in place of function inputs and a resolved input data type. struct InputArgExpansion { // TODO(ezhulenev): Add support for functions with tensor sequence inputs of - // different data types + // different data types. + // TODO(ezhulenev): Support type parametrized inputs? 
string input_name; // name of the function input argument DataType data_type; // input data type bool is_ref; // if true, inputs are required to be refs @@ -53,7 +54,8 @@ struct InputArgExpansion { // tensors of a function body nodes and a resolved output data type struct OutputArgExpansion { // TODO(ezhulenev): Add support for functions with tensor sequence outputs of - // different data types + // different data types. + // TODO(ezhulenev): Support type parametrized outputs? string output_name; // name of the function output argument DataType data_type; // output data type bool is_ref; // if true, outputs are refs @@ -186,13 +188,6 @@ bool HasParametrizedBody(const FunctionDef& func); // Check if function has parametrized type or body. bool IsParametrized(const FunctionDef& func); -// Make a GrapplerFunctionItem from the function definition and attributes. -// Return error if the given function def cannot be converted. -Status MakeGrapplerFunctionItem( - const FunctionDef& func, - const std::unordered_map& func_instantiation_attr, - const FunctionLibraryDefinition& flib, GrapplerFunctionItem* item); - // Register GrapplerFunctionItem input arg expansion and function body outputs // in the GrapplerFunctionConnectivity. Use function library definition to // lookup function body nodes output names and ranges. @@ -200,11 +195,28 @@ Status RegisterGrapplerFunctionConnectivity( const GrapplerFunctionItem& item, const FunctionLibraryDefinition& flib, GrapplerFunctionConnectivity* connectivity); -// Make a specialized FunctionDef from the GrapplerFunctionItem. Use function -// library definition to lookup function body nodes output names and ranges. -Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item, - const FunctionLibraryDefinition& flib, - FunctionDef* func); +// Make a GrapplerFunctionItem from the function definition and function +// instantiation attributes (caller node attributes). Returns error if the given +// function def cannot be converted (e.g. not all attributes are defined). +Status MakeGrapplerFunctionItem( + const FunctionDef& func, + const std::unordered_map& func_instantiation_attr, + const FunctionLibraryDefinition& flib, GrapplerFunctionItem* item); + +// Make a GrapplerFunction item from the function definition. Function must be +// fully defined (no type or body parametrization). +// TODO(ezhulenev): Support parametrized functions without fully defined +// instantiation attributes? Do we ever want to optimize parametrized function +// without specializing it to it's instantiation attributes (at least types)? +Status MakeGrapplerFunctionItem(const FunctionDef& func, + const FunctionLibraryDefinition& flib, + GrapplerFunctionItem* item); + +// Make a FunctionDef from the GrapplerFunctionItem. Use function library +// definition to lookup function body nodes output names and ranges. 
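+// (Inverse of MakeGrapplerFunctionItem above: the item's body graph and its
+// input/output argument expansions are written back into a FunctionDef.)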
+Status MakeFunctionDef(const GrapplerFunctionItem& item, + const FunctionLibraryDefinition& flib, + FunctionDef* func); } // end namespace grappler } // end namespace tensorflow diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc index 54d235a8a46..6dfd49b9438 100644 --- a/tensorflow/core/grappler/utils/functions_test.cc +++ b/tensorflow/core/grappler/utils/functions_test.cc @@ -524,7 +524,7 @@ TEST_F(FunctionsTest, FromFunctionDefWithoutInput) { EXPECT_EQ("two", cast.input(0)); } -TEST_F(FunctionsTest, MakeSpecializedFunctionDef) { +TEST_F(FunctionsTest, MakeFunctionDef) { const Tensor kTwo = test::AsScalar(2); FunctionDef func = FunctionDefHelper::Define( // Name @@ -550,7 +550,7 @@ TEST_F(FunctionsTest, MakeSpecializedFunctionDef) { TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item)); FunctionDef specialized; - TF_EXPECT_OK(MakeSpecializedFunctionDef(item, flib, &specialized)); + TF_EXPECT_OK(MakeFunctionDef(item, flib, &specialized)); // Input and output types are resolved based on instantiation attributes. EXPECT_EQ("x", specialized.signature().input_arg(0).name()); @@ -573,7 +573,7 @@ TEST_F(FunctionsTest, MakeSpecializedFunctionDef) { EXPECT_EQ(2, count); } -TEST_F(FunctionsTest, SwapFunctionBodyAndMakeSpecializedFunctionDef) { +TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) { using test::function::NDef; FunctionDef mul_func = FunctionDefHelper::Create( @@ -606,7 +606,7 @@ TEST_F(FunctionsTest, SwapFunctionBodyAndMakeSpecializedFunctionDef) { // Replace function body with identity function item.SwapFunctionBody(std::move(id_func_body)); FunctionDef specialized; - TF_EXPECT_OK(MakeSpecializedFunctionDef(item, flib, &specialized)); + TF_EXPECT_OK(MakeFunctionDef(item, flib, &specialized)); // Check that graph body was updated. int count = 0; From 39a2787272f948a043a1ca103159307cfb0f7248 Mon Sep 17 00:00:00 2001 From: ImSheridan Date: Fri, 20 Apr 2018 09:20:38 +0800 Subject: [PATCH 0484/1734] Fix incorrect math equation renderings broken by backtick (#18386) * Fix incorrect `` typo format * Remove breaking ``` for math equations * fix one more typo * fix more math equation broken ` typos in py --- .../bayesflow/python/ops/monte_carlo_impl.py | 22 ++++++--------- .../factorization/python/ops/kmeans.py | 4 +-- .../python/contrib.bayesflow.monte_carlo.md | 28 ++++++++----------- tensorflow/python/ops/nn_ops.py | 2 +- 4 files changed, 23 insertions(+), 33 deletions(-) diff --git a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py index 48ff0835321..032b859d469 100644 --- a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py +++ b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py @@ -44,15 +44,13 @@ def expectation_importance_sampler(f, n=None, seed=None, name='expectation_importance_sampler'): - r"""Monte Carlo estimate of `\\(E_p[f(Z)] = E_q[f(Z) p(Z) / q(Z)]\\)`. + r"""Monte Carlo estimate of \\(E_p[f(Z)] = E_q[f(Z) p(Z) / q(Z)]\\). - With `\\(p(z) := exp^{log_p(z)}\\)`, this `Op` returns + With \\(p(z) := exp^{log_p(z)}\\), this `Op` returns - ``` \\(n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ], z_i ~ q,\\) \\(\approx E_q[ f(Z) p(Z) / q(Z) ]\\) \\(= E_p[f(Z)]\\) - ``` This integral is done in log-space with max-subtraction to better handle the often extreme values that `f(z) p(z) / q(z)` can take on. 
@@ -121,14 +119,12 @@ def expectation_importance_sampler_logspace( name='expectation_importance_sampler_logspace'): r"""Importance sampling with a positive function, in log-space. - With `\\(p(z) := exp^{log_p(z)}\\)`, and `\\(f(z) = exp{log_f(z)}\\)`, + With \\(p(z) := exp^{log_p(z)}\\), and \\(f(z) = exp{log_f(z)}\\), this `Op` returns - ``` \\(Log[ n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ] ], z_i ~ q,\\) \\(\approx Log[ E_q[ f(Z) p(Z) / q(Z) ] ]\\) \\(= Log[E_p[f(Z)]]\\) - ``` This integral is done in log-space with max-subtraction to better handle the often extreme values that `f(z) p(z) / q(z)` can take on. @@ -196,13 +192,11 @@ def _logspace_mean(log_values): def expectation(f, samples, log_prob=None, use_reparametrization=True, axis=0, keep_dims=False, name=None): - """Computes the Monte-Carlo approximation of `\\(E_p[f(X)]\\)`. + """Computes the Monte-Carlo approximation of \\(E_p[f(X)]\\). This function computes the Monte-Carlo approximation of an expectation, i.e., - ```none \\(E_p[f(X)] \approx= m^{-1} sum_i^m f(x_j), x_j\ ~iid\ p(X)\\) - ``` where: @@ -216,8 +210,8 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True, parameterless distribution (e.g., `Normal(Y; m, s) <=> Y = sX + m, X ~ Normal(0,1)`), we can swap gradient and expectation, i.e., - `grad[ Avg{ \\(s_i : i=1...n\\) } ] = Avg{ grad[\\(s_i\\)] : i=1...n }` where - `S_n = Avg{\\(s_i\\)}` and `\\(s_i = f(x_i), x_i ~ p\\)`. + grad[ Avg{ \\(s_i : i=1...n\\) } ] = Avg{ grad[\\(s_i\\)] : i=1...n } where + S_n = Avg{\\(s_i\\)}` and `\\(s_i = f(x_i), x_i ~ p\\). However, if p is not reparameterized, TensorFlow's gradient will be incorrect since the chain-rule stops at samples of non-reparameterized distributions. @@ -296,7 +290,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True, Args: f: Python callable which can return `f(samples)`. samples: `Tensor` of samples used to form the Monte-Carlo approximation of - `\\(E_p[f(X)]\\)`. A batch of samples should be indexed by `axis` + \\(E_p[f(X)]\\). A batch of samples should be indexed by `axis` dimensions. log_prob: Python callable which can return `log_prob(samples)`. Must correspond to the natural-logarithm of the pdf/pmf of each sample. Only @@ -317,7 +311,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True, Returns: approx_expectation: `Tensor` corresponding to the Monte-Carlo approximation - of `\\(E_p[f(X)]\\)`. + of \\(E_p[f(X)]\\). Raises: ValueError: if `f` is not a Python `callable`. diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py index bfe338c9f9a..9ffdd3ba5e8 100644 --- a/tensorflow/contrib/factorization/python/ops/kmeans.py +++ b/tensorflow/contrib/factorization/python/ops/kmeans.py @@ -374,11 +374,11 @@ class KMeansClustering(estimator.Estimator): than `num_clusters`, a TensorFlow runtime error occurs. distance_metric: The distance metric used for clustering. One of: * `KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE`: Euclidean distance - between vectors `u` and `v` is defined as `\\(||u - v||_2\\)` + between vectors `u` and `v` is defined as \\(||u - v||_2\\) which is the square root of the sum of the absolute squares of the elements' difference. * `KMeansClustering.COSINE_DISTANCE`: Cosine distance between vectors - `u` and `v` is defined as `\\(1 - (u . v) / (||u||_2 ||v||_2)\\)`. + `u` and `v` is defined as \\(1 - (u . v) / (||u||_2 ||v||_2)\\). random_seed: Python integer. Seed for PRNG used to initialize centers. 
use_mini_batch: A boolean specifying whether to use the mini-batch k-means algorithm. See explanation above. diff --git a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md index f3db5857aec..74fe4a323aa 100644 --- a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md +++ b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md @@ -6,43 +6,39 @@ Monte Carlo integration and helpers. ## Background Monte Carlo integration refers to the practice of estimating an expectation with -a sample mean. For example, given random variable `Z in \\(R^k\\)` with density `p`, +a sample mean. For example, given random variable Z in \\(R^k\\) with density `p`, the expectation of function `f` can be approximated like: -``` $$E_p[f(Z)] = \int f(z) p(z) dz$$ $$ ~ S_n := n^{-1} \sum_{i=1}^n f(z_i), z_i\ iid\ samples\ from\ p.$$ -``` -If `\\(E_p[|f(Z)|] < infinity\\)`, then `\\(S_n\\) --> \\(E_p[f(Z)]\\)` by the strong law of large -numbers. If `\\(E_p[f(Z)^2] < infinity\\)`, then `\\(S_n\\)` is asymptotically normal with -variance `\\(Var[f(Z)] / n\\)`. +If \\(E_p[|f(Z)|] < infinity\\), then \\(S_n\\) --> \\(E_p[f(Z)]\\) by the strong law of large +numbers. If \\(E_p[f(Z)^2] < infinity\\), then \\(S_n\\) is asymptotically normal with +variance \\(Var[f(Z)] / n\\). Practitioners of Bayesian statistics often find themselves wanting to estimate -`\\(E_p[f(Z)]\\)` when the distribution `p` is known only up to a constant. For +\\(E_p[f(Z)]\\) when the distribution `p` is known only up to a constant. For example, the joint distribution `p(z, x)` may be known, but the evidence -`\\(p(x) = \int p(z, x) dz\\)` may be intractable. In that case, a parameterized -distribution family `\\(q_\lambda(z)\\)` may be chosen, and the optimal `\\(\lambda\\)` is the -one minimizing the KL divergence between `\\(q_\lambda(z)\\)` and -`\\(p(z | x)\\)`. We only know `p(z, x)`, but that is sufficient to find `\\(\lambda\\)`. +\\(p(x) = \int p(z, x) dz\\) may be intractable. In that case, a parameterized +distribution family \\(q_\lambda(z)\\) may be chosen, and the optimal \\(\lambda\\) is the +one minimizing the KL divergence between \\(q_\lambda(z)\\) and +\\(p(z | x)\\). We only know `p(z, x)`, but that is sufficient to find \\(\lambda\\). ## Log-space evaluation and subtracting the maximum Care must be taken when the random variable lives in a high dimensional space. -For example, the naive importance sample estimate `\\(E_q[f(Z) p(Z) / q(Z)]\\)` -involves the ratio of two terms `\\(p(Z) / q(Z)\\)`, each of which must have tails -dropping off faster than `\\(O(|z|^{-(k + 1)})\\)` in order to have finite integral. +For example, the naive importance sample estimate \\(E_q[f(Z) p(Z) / q(Z)]\\) +involves the ratio of two terms \\(p(Z) / q(Z)\\), each of which must have tails +dropping off faster than \\(O(|z|^{-(k + 1)})\\) in order to have finite integral. This ratio would often be zero or infinity up to numerical precision. For that reason, we write -``` $$Log E_q[ f(Z) p(Z) / q(Z) ]$$ $$ = Log E_q[ \exp\{Log[f(Z)] + Log[p(Z)] - Log[q(Z)] - C\} ] + C,$$ where $$C := Max[ Log[f(Z)] + Log[p(Z)] - Log[q(Z)] ].$$ -``` The maximum value of the exponentiated term will be 0.0, and the expectation can be evaluated in a stable manner. 
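The max-subtraction identity above is easy to demonstrate in a few lines. A minimal NumPy sketch (illustrative only, not part of the patch), where w stands for the log-weights Log[f(Z)] + Log[p(Z)] - Log[q(Z)]:

    import numpy as np

    def log_mean_exp(w):
      c = np.max(w)  # C := Max[w]
      return np.log(np.mean(np.exp(w - c))) + c  # exp(w - c) <= 1: no overflow

    w = np.array([-1000.0, -1001.0, -999.5])  # naive np.exp(w) underflows to 0
    print(log_mean_exp(w))  # finite result (about -999.99) instead of -inf

Without the subtraction, np.log(np.mean(np.exp(w))) evaluates to -inf here, since every exponentiated term underflows to 0.0.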
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index a8d0293d136..cd07550d2ee 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -1155,7 +1155,7 @@ def atrous_conv2d(value, filters, rate, padding, name=None):
 
   Returns:
     A `Tensor` with the same type as `value`.
-    Output shape with `'VALID`` padding is:
+    Output shape with `'VALID'` padding is:
 
       [batch, height - 2 * (filter_width - 1),
        width - 2 * (filter_height - 1),
       out_channels].

From a734919fd8fd6d74edf1e7c3abec3ee11fec83fd Mon Sep 17 00:00:00 2001
From: Jiajia Li
Date: Fri, 20 Apr 2018 09:22:26 +0800
Subject: [PATCH 0485/1734] Fix the error looking for libhdfs.so, Mac OS using
 libhdfs.dylib (#18486)

---
 tensorflow/core/platform/hadoop/hadoop_file_system.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
index 9a71fbe2b78..a8cb40502c1 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
@@ -109,6 +109,8 @@ class LibHDFS {
   // in the libhdfs documentation.
 #if defined(PLATFORM_WINDOWS)
   const char* kLibHdfsDso = "hdfs.dll";
+#elif defined(MACOS) || defined(TARGET_OS_MAC)
+  const char* kLibHdfsDso = "libhdfs.dylib";
 #else
   const char* kLibHdfsDso = "libhdfs.so";
 #endif

From 256aad5324d163c028da0dc0318c3e00cf2fc3ab Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Thu, 19 Apr 2018 18:29:00 -0700
Subject: [PATCH 0486/1734] [XLA] Fix a bug in the name_uniquer.

The problem happened because the name_uniquer stripped away the numeric
suffix if it was <= 0. The solution: if the input had a numeric suffix,
the result should also have a numeric suffix.

PiperOrigin-RevId: 193606838
---
 tensorflow/compiler/xla/service/name_uniquer.cc      | 11 ++++++-----
 tensorflow/compiler/xla/service/name_uniquer_test.cc | 11 +++++++++--
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/tensorflow/compiler/xla/service/name_uniquer.cc b/tensorflow/compiler/xla/service/name_uniquer.cc
index 7d8c05fffa4..f74bcb0b793 100644
--- a/tensorflow/compiler/xla/service/name_uniquer.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer.cc
@@ -53,17 +53,18 @@ NameUniquer::NameUniquer(const string& separator) {
 }
 
 string NameUniquer::GetUniqueName(tensorflow::StringPiece prefix) {
-  string root = prefix.empty() ? "name" : prefix.ToString();
-  root = GetSanitizedName(root);
+  string root = GetSanitizedName(prefix.empty() ? "name" : prefix.ToString());
 
   // Strip away numeric suffix (if any). Only recognize separator if it is in
   // the middle of the name.
+  bool has_numeric_suffix = false;
+  int64 numeric_suffix = 0;
   size_t separator_index = root.rfind(separator_);
   if (separator_index != string::npos && (separator_index > 0) &&
       (separator_index < root.size() - 1)) {
     string after_suffix = root.substr(separator_index + 1);
-    int64 numeric_suffix;
     if (tensorflow::strings::safe_strto64(after_suffix, &numeric_suffix)) {
+      has_numeric_suffix = true;
       // Remove numeric suffix from root.
       root = root.substr(0, separator_index);
       // Update count to at least the numeric suffix value to avoid future
@@ -71,11 +72,11 @@ string NameUniquer::GetUniqueName(tensorflow::StringPiece prefix) {
       generated_names_[root] = std::max(generated_names_[root], numeric_suffix);
     }
   }
-
   int64* count = &(generated_names_[root]);
   if (*count == 0) {
     *count = 1;
-    return root;
+    return has_numeric_suffix ?
tensorflow::strings::StrCat(root, separator_, 0) + : root; } else { tensorflow::strings::StrAppend(&root, separator_, *count); // Increment lookup under old 'root' name. diff --git a/tensorflow/compiler/xla/service/name_uniquer_test.cc b/tensorflow/compiler/xla/service/name_uniquer_test.cc index 4258cf16876..2ec255558c4 100644 --- a/tensorflow/compiler/xla/service/name_uniquer_test.cc +++ b/tensorflow/compiler/xla/service/name_uniquer_test.cc @@ -57,11 +57,18 @@ TEST_F(NameUniquerTest, NumericSuffixes) { EXPECT_EQ("foo.55", uniquer.GetUniqueName("foo")); EXPECT_EQ("foo.55.1", uniquer.GetUniqueName("foo.55.1")); EXPECT_EQ("foo.55.2", uniquer.GetUniqueName("foo.55.1")); - EXPECT_EQ("bar", uniquer.GetUniqueName("bar.-1000")); + EXPECT_EQ("bar.0", uniquer.GetUniqueName("bar.-1000")); EXPECT_EQ("bar.1", uniquer.GetUniqueName("bar.-2000")); EXPECT_EQ("bar.2", uniquer.GetUniqueName("bar.1")); } +TEST_F(NameUniquerTest, PrefixHasSuffix) { + NameUniquer uniquer("."); + + EXPECT_EQ("foo.11.0", uniquer.GetUniqueName("foo.11.0")); + EXPECT_EQ("foo.11", uniquer.GetUniqueName("foo.11")); +} + TEST_F(NameUniquerTest, Sanitize) { NameUniquer uniquer("_"); @@ -73,7 +80,7 @@ TEST_F(NameUniquerTest, Sanitize) { EXPECT_EQ("foo_55", uniquer.GetUniqueName("foo")); // Invalid characters will be replaced with '_'. - EXPECT_EQ("bar", uniquer.GetUniqueName("bar<-1000")); + EXPECT_EQ("bar_0", uniquer.GetUniqueName("bar<-1000")); EXPECT_EQ("bar_1", uniquer.GetUniqueName("bar<-2000")); EXPECT_EQ("bar_2", uniquer.GetUniqueName("bar_1")); From 052c3863cf8b901303a1a32e82b6525dc6ea6dbd Mon Sep 17 00:00:00 2001 From: Anna R Date: Thu, 19 Apr 2018 18:45:47 -0700 Subject: [PATCH 0487/1734] Internal change. PiperOrigin-RevId: 193608140 --- tensorflow/compiler/xla/python/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index 0b9333b406d..ecb87bd8893 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -8,7 +8,6 @@ py_library( name = "xla_client", srcs = ["xla_client.py"], srcs_version = "PY2AND3", - tags = ["no_oss"], visibility = ["//visibility:public"], deps = [ ":pywrap_xla", @@ -21,6 +20,7 @@ py_test( srcs = ["xla_client_test.py"], main = "xla_client_test.py", srcs_version = "PY2AND3", + tags = ["no_oss"], deps = [ ":xla_client", "//tensorflow/python:platform_test", From 6e2df5e471295cd32f9887d76e6ddbf1b4e2a11a Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Thu, 19 Apr 2018 19:03:03 -0700 Subject: [PATCH 0488/1734] Automated g4 rollback of changelist 193593761 PiperOrigin-RevId: 193609407 --- tensorflow/compiler/xla/service/BUILD | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index d5d09bd8a3a..9009cbf845e 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -699,7 +699,6 @@ cc_library( "//tensorflow/compiler/xla/service/cpu:cpu_compiler", "//tensorflow/compiler/xla/service/cpu:cpu_transfer_manager", "//tensorflow/core:stream_executor_no_cuda", - "//tensorflow/stream_executor:stream_executor_impl", ], ) From b001827146ff95c9e0ce5668c85d8cc2daf6b78d Mon Sep 17 00:00:00 2001 From: Igor Saprykin Date: Thu, 19 Apr 2018 19:11:37 -0700 Subject: [PATCH 0489/1734] Support variable parameter structure in TPU distribution strategy. TPUStrategy is added to a few more tests. 
There appears to be an issue with the batch norm test in minimize_loss_test where the moving averages stay at 0. I'm trying to resolve that separately as the next CL. PiperOrigin-RevId: 193610264 --- tensorflow/contrib/distribute/python/BUILD | 18 +++-- .../distribute/python/minimize_loss_test.py | 19 ++++- .../distribute/python/single_loss_example.py | 7 +- .../contrib/distribute/python/tpu_strategy.py | 70 +++++++++++-------- .../contrib/distribute/python/values.py | 34 +++++++-- 5 files changed, 104 insertions(+), 44 deletions(-) diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD index 837a1f13480..c2834d82266 100644 --- a/tensorflow/contrib/distribute/python/BUILD +++ b/tensorflow/contrib/distribute/python/BUILD @@ -231,15 +231,14 @@ py_library( srcs = ["tpu_strategy.py"], visibility = ["//tensorflow:internal"], deps = [ - "//tensorflow/contrib/distribute/python:one_device_strategy", - "//tensorflow/contrib/eager/python:datasets", - "//tensorflow/contrib/optimizer_v2:training", + ":one_device_strategy", + ":values", "//tensorflow/contrib/tpu", - "//tensorflow/python:array_ops", + "//tensorflow/contrib/tpu:tpu_py", + "//tensorflow/python:constant_op", + "//tensorflow/python:control_flow_ops", "//tensorflow/python:framework_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python/eager:context", - "@six_archive//:six", + "//tensorflow/python:util", ], ) @@ -249,9 +248,13 @@ py_library( srcs = ["minimize_loss_test.py"], deps = [ ":combinations", + ":mirrored_strategy", ":single_loss_example", + "//tensorflow/contrib/tpu:tpu_lib", "//tensorflow/python:control_flow_ops", + "//tensorflow/python:framework_ops", "//tensorflow/python:math_ops", + "//tensorflow/python:variable_scope", "//tensorflow/python:variables", "//tensorflow/python/data/ops:dataset_ops", "//tensorflow/python/eager:context", @@ -324,6 +327,7 @@ py_library( srcs = ["single_loss_example.py"], deps = [ ":step_fn", + "//tensorflow/contrib/data/python/ops:batching", "//tensorflow/python:array_ops", "//tensorflow/python:constant_op", "//tensorflow/python:layers", diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py index 43b2e91cbf1..e134fe34e10 100644 --- a/tensorflow/contrib/distribute/python/minimize_loss_test.py +++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py @@ -96,8 +96,17 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): combinations.times( combinations.distributions_and_v1_optimizers() + combinations.distributions_and_v2_optimizers(), - combinations.combine(mode=["graph", "eager"]))) - def testOptimizerInsideModelFn(self, distribution, optimizer_fn): + combinations.combine(mode=["graph", "eager"], is_tpu=[False])) + + combinations.combine( + distribution=[combinations.tpu_strategy], + optimizer_fn=[ + combinations.adam_optimizer_v1_fn, + combinations.gradient_descent_optimizer_v1_fn + ], + mode=["graph"], + is_tpu=[True])) + + def testOptimizerInsideModelFn(self, distribution, optimizer_fn, is_tpu): created_variables = [] trainable_variables = [] @@ -128,11 +137,17 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): if not context.executing_eagerly(): with self.test_session() as sess: + if is_tpu: + sess.run(tpu.initialize_system()) run_step = sess.make_callable(run_step()) self.evaluate(variables_lib.global_variables_initializer()) run_step() + if is_tpu: + with self.test_session() as sess: + sess.run(tpu.shutdown_system()) + def 
get_expected_variables(optimizer_fn, num_parameter_devices): variables_map = { "GradientDescent": ["dense/kernel", "dense/bias"], diff --git a/tensorflow/contrib/distribute/python/single_loss_example.py b/tensorflow/contrib/distribute/python/single_loss_example.py index abd13c6cc69..0db0b59fcac 100644 --- a/tensorflow/contrib/distribute/python/single_loss_example.py +++ b/tensorflow/contrib/distribute/python/single_loss_example.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.contrib.data.python.ops import batching from tensorflow.contrib.distribute.python import step_fn from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import constant_op @@ -54,7 +55,11 @@ def minimize_loss_example(optimizer_fn, """Example of non-distribution-aware legacy code.""" def dataset_fn(): - return dataset_ops.Dataset.from_tensors([[1.]]).repeat().batch(2) + dataset = dataset_ops.Dataset.from_tensors([[1.]]).repeat() + # TODO(isaprykin): map_and_batch with drop_remainder causes shapes to be + # fully defined for TPU. Remove this when XLA supports dynamic shapes. + return dataset.apply( + batching.map_and_batch(lambda x: x, batch_size=2, drop_remainder=True)) # An Optimizer instance is created either outside or inside model_fn. outer_optimizer = None diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py index ceb52ceca72..a7e4fe80f3e 100644 --- a/tensorflow/contrib/distribute/python/tpu_strategy.py +++ b/tensorflow/contrib/distribute/python/tpu_strategy.py @@ -21,15 +21,16 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import itertools + from tensorflow.contrib import tpu from tensorflow.contrib.distribute.python import one_device_strategy from tensorflow.contrib.distribute.python import values from tensorflow.contrib.tpu.python.ops import tpu_ops from tensorflow.python.framework import constant_op -from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops +from tensorflow.python.util import nest # TODO(isaprykin): Consider whether inheriting is really appropriate. @@ -37,48 +38,53 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy): """Experimental TPU distribution strategy implementation.""" def __init__(self, - global_batch_size=2, num_cores_per_host=2, iterations_per_step=2): - # TODO(isaprykin): Generalize the defaults. + # TODO(isaprykin): Generalize the defaults. They are currently tailored for + # the unit test. super(TPUStrategy, self).__init__('/cpu:0') # TODO(isaprykin): Auto-detect number of cores and hosts. self._num_cores_per_host = num_cores_per_host - self._global_batch_size = global_batch_size # TODO(isaprykin): This might have to be per-call. self._iterations_per_step = iterations_per_step def distribute_dataset(self, dataset_fn): return values.PerIterationDataset( - self._call_dataset_fn(dataset_fn), self._iterations_per_step) + self._call_dataset_fn(dataset_fn), self._iterations_per_step, + self._num_cores_per_host) def _call_for_each_tower(self, fn, *args, **kwargs): kwargs.pop('run_concurrently', None) - # TODO(isaprykin): Support variable arguments similar to PerDevice+regroup. 
- inputs = args[0] + inputs = {'args': args, 'kwargs': kwargs} + flat_inputs = nest.flatten(inputs) - sharded_shape = [None] # Python 2 nonlocal. + feed_mask = [isinstance(f, values.PerIteration) for f in flat_inputs] + + feeds = lambda: itertools.compress(flat_inputs, feed_mask) + shapes = [f.get_shape() for f in feeds()] + if any([not s.is_fully_defined() for s in shapes]): + raise ValueError( + 'TPU currently requires fully defined shapes. Either use ' + 'set_shape() on the input tensors or use ' + 'dataset.apply(map_and_batch(..., drop_remainder=True)).') + types = [f.get_dtype() for f in feeds()] def infeed_input(i): """Get input, split it and then enqueue.""" - batches = array_ops.gather(inputs, i) + iteration_inputs = [f.get(i) for f in feeds()] - # TODO(isaprykin): Handle partial batch. - global_shape = [self._global_batch_size] + list(batches.get_shape())[1:] - sharded_shape[0] = ([self._global_batch_size / self._num_cores_per_host] + - list(global_shape)[1:]) + infeed_inputs = [[inputs_per_core[core_id] + for inputs_per_core in iteration_inputs] + for core_id in range(self._num_cores_per_host)] - batches.set_shape(global_shape) - batches = array_ops.split(batches, self._num_cores_per_host) + infeed_ops = [] + for core_id, infeed_input in enumerate(infeed_inputs): + infeed_ops.append( + tpu_ops.infeed_enqueue_tuple( + inputs=infeed_input, shapes=shapes, device_ordinal=core_id)) - infeeds = [ - tpu_ops.infeed_enqueue_tuple( - inputs=[batches[j]], shapes=[sharded_shape[0]], device_ordinal=j) - for j in range(self._num_cores_per_host) - ] - - with ops.control_dependencies(infeeds): + with ops.control_dependencies(infeed_ops): return i + 1 with ops.device('/task:0/device:CPU:0'): @@ -87,13 +93,21 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy): infeed_input, [constant_op.constant(0)], parallel_iterations=1) - assert sharded_shape[0] - def dequeueing_fn(*args, **kwargs): + """Dequeue input arguments and supply them to `fn`.""" del args, kwargs - x, = tpu.infeed_dequeue_tuple( - dtypes=[dtypes.float32], shapes=[sharded_shape[0]]) - return fn(x) + dequeued = tpu.infeed_dequeue_tuple(dtypes=types, shapes=shapes) + dequeued = iter(dequeued) + + fn_inputs = [] + for inp, is_feed in zip(flat_inputs, feed_mask): + if is_feed: + fn_inputs.append(next(dequeued)) + else: + fn_inputs.append(inp) + + fn_inputs = nest.pack_sequence_as(inputs, fn_inputs) + return fn(*fn_inputs['args'], **fn_inputs['kwargs']) def iterate_on_tpu(): return tpu.repeat(self._iterations_per_step, dequeueing_fn, []) diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py index 62016c3a789..8cb5276579f 100644 --- a/tensorflow/contrib/distribute/python/values.py +++ b/tensorflow/contrib/distribute/python/values.py @@ -570,18 +570,36 @@ class PerDeviceDataset(object): dataset_iterator, self._devices, self._prefetch_on_device) +class PerIteration(object): + """Holds input for multiple iterations at once.""" + + def __init__(self, index): + self._index = index + + def get(self, iteration): + return array_ops.gather(self._index, iteration) + + def get_shape(self): + return self._index[-1][-1].get_shape() + + def get_dtype(self): + return self._index[-1][-1].dtype + + class MultiIterator(object): """Iterator that returns results of multiple get_next()s.""" - def __init__(self, dataset_iterator, iterations): + def __init__(self, dataset_iterator, iterations, batches_per_iteration): self._dataset_iterator = dataset_iterator self._iterations = iterations + 
   self._batches_per_iteration = batches_per_iteration
 
   def get_next(self, name=None):
-    return [
+    return PerIteration([[
         self._dataset_iterator.get_next(name=name)
-        for _ in range(self._iterations)
+        for _ in range(self._batches_per_iteration)
     ]
+                         for _ in range(self._iterations)])
 
   @property
   def initializer(self):
@@ -589,18 +607,22 @@ class MultiIterator(object):
 
 
 class PerIterationDataset(object):
+  """A dataset that returns MultiIterators."""
 
-  def __init__(self, dataset, iterations):
+  def __init__(self, dataset, iterations, batches_per_iteration):
     self._dataset = dataset
     self._iterations = iterations
+    self._batches_per_iteration = batches_per_iteration
 
   def make_one_shot_iterator(self):
     iterator = self._dataset.make_one_shot_iterator()
-    return MultiIterator(iterator, self._iterations)
+    return MultiIterator(iterator, self._iterations,
+                         self._batches_per_iteration)
 
   def make_initializable_iterator(self):
     iterator = self._dataset.make_initializable_iterator()
-    return MultiIterator(iterator, self._iterations)
+    return MultiIterator(iterator, self._iterations,
+                         self._batches_per_iteration)
 
 
 class MapOutput(object):

From 8723770b4cbcac0a528354d8508a5ef83716d1fa Mon Sep 17 00:00:00 2001
From: Justin Lebar
Date: Thu, 19 Apr 2018 19:27:34 -0700
Subject: [PATCH 0490/1734] [XLA] Remove default argument on virtual function
 DeviceMemoryAllocator::Allocate().

Default args on virtual functions are disallowed by the Google style
guide, for good reason. They have the extremely surprising behavior that
the defaults you get when calling a function through a pointer depend not
on the underlying type of the object, but on the static type of the
pointer!

PiperOrigin-RevId: 193611213
---
 .../xla/service/device_memory_allocator.h      | 30 ++++++++++++++-----
 .../xla/tests/local_client_test_base.cc        |  3 +-
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.h b/tensorflow/compiler/xla/service/device_memory_allocator.h
index 240acf89739..da45c4d45a1 100644
--- a/tensorflow/compiler/xla/service/device_memory_allocator.h
+++ b/tensorflow/compiler/xla/service/device_memory_allocator.h
@@ -38,13 +38,25 @@ class DeviceMemoryAllocator {
   virtual ~DeviceMemoryAllocator() {}
 
   // 'retry_on_failure': If false, and the first attempt to allocate the memory
-  // fails, the allocation should return immediately without retrying.
-  // An example use case is optional scratch spaces where a failure
-  // has only performance impact.
+  // fails, the allocation should return immediately without retrying. An
+  // example use case is optional scratch spaces where a failure has only
+  // performance impact.
+  //
   // Allocate() should return a null pointer for a size-0 allocation.
   // Deallocate() must be a no-op for null pointers.
-  virtual StatusOr<se::DeviceMemoryBase> Allocate(
-      int device_ordinal, uint64 size, bool retry_on_failure = true) = 0;
+  virtual StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal,
+                                                  uint64 size,
+                                                  bool retry_on_failure) = 0;
+
+  // Two-arg version of Allocate(), which sets retry-on-failure to true.
+  //
+  // (We don't simply use a default argument on the virtual Allocate function
+  // because default args on virtual functions are disallowed by the Google
+  // style guide.)
+  StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal, uint64 size) {
+    return Allocate(device_ordinal, size, /*retry_on_failure=*/true);
+  }
+
   virtual tensorflow::Status Deallocate(int device_ordinal,
                                         se::DeviceMemoryBase* mem) = 0;
 
@@ -67,8 +79,12 @@ class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator {
       const se::Platform* platform,
       tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors);
 
-  StatusOr<se::DeviceMemoryBase> Allocate(
-      int device_ordinal, uint64 size, bool retry_on_failure = true) override;
+  StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal, uint64 size,
+                                          bool retry_on_failure) override;
+
+  // Pull in two-arg overload that sets retry_on_failure to true.
+  using DeviceMemoryAllocator::Allocate;
+
   tensorflow::Status Deallocate(int device_ordinal,
                                 se::DeviceMemoryBase* mem) override;
 
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index c60ba2422f4..bb5aabb214d 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -44,7 +44,8 @@ StatusOr<se::DeviceMemoryBase> TestAllocator::Allocate(int device_ordinal,
     allocation_count_++;
     device_allocation_count_[device_ordinal]++;
   }
-  return StreamExecutorMemoryAllocator::Allocate(device_ordinal, size);
+  return StreamExecutorMemoryAllocator::Allocate(device_ordinal, size,
+                                                 retry_on_failure);
 }
 
 tensorflow::Status TestAllocator::Deallocate(int device_ordinal,

From 2a956c9b8f9950405b481ccc0e05636873ecc9ae Mon Sep 17 00:00:00 2001
From: Yong Tang
Date: Fri, 20 Apr 2018 02:40:37 +0000
Subject: [PATCH 0491/1734] Support string tensors for tf.count_nonzero

This fix addresses the issue raised in 18712, where `tf.count_nonzero`
does not support string tensors.

The implementation of `tf.count_nonzero` relies on `tf.not_equal`, which
actually supports string tensors. The reason string tensors do not work
is that `tf.count_nonzero` creates a numpy `zero` value via
`input_tensor.dtype.as_numpy_dtype()`. The numpy `zero` is then passed
to `tf.not_equal` (which converts it into a zero tensor). However,
`input_tensor.dtype.as_numpy_dtype()` converts `tf.string` to
`numpy.object`, hence the exception.

That conversion is not necessary: `zero` can be created with `tf.zeros`
directly, without the round trip through numpy. This fix fixes 18712.

Signed-off-by: Yong Tang
---
 tensorflow/python/ops/math_ops.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 781b1c557f3..8c9ad66b0e2 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -1487,7 +1487,8 @@ def count_nonzero(input_tensor,
 
   with ops.name_scope(name, "count_nonzero", [input_tensor]):
     input_tensor = ops.convert_to_tensor(input_tensor, name="input_tensor")
-    zero = input_tensor.dtype.as_numpy_dtype()
+    # A scalar of 'zero' is enough as `not_equal` will broadcast.
+ zero = array_ops.zeros([], dtype=input_tensor.dtype) return cast( reduce_sum( # int64 reduction happens on GPU From 37999ce500f27d587100f0bf45e87957936f5ada Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Fri, 20 Apr 2018 02:48:15 +0000 Subject: [PATCH 0492/1734] Add test case for tf.string support with tf.count_nonzero Signed-off-by: Yong Tang --- tensorflow/python/kernel_tests/reduction_ops_test.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py index 589ea54973c..0be89e1ff4e 100644 --- a/tensorflow/python/kernel_tests/reduction_ops_test.py +++ b/tensorflow/python/kernel_tests/reduction_ops_test.py @@ -958,6 +958,12 @@ class CountNonzeroReductionTest(test.TestCase): y = math_ops.count_nonzero(x, [0]) self.assertAllEqual(y.eval(), np.zeros(9938)) + def testStringReduce(self): + # Test case for GitHub issue 18712 + with self.test_session() as sess: + v = math_ops.count_nonzero(constant_op.constant(["test"])) + self.assertAllClose(sess.run(v), 1) + if __name__ == "__main__": test.main() From 7358025743951b42fe0f99fb85b4418769de5357 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Fri, 20 Apr 2018 02:51:54 +0000 Subject: [PATCH 0493/1734] Add test cases with axis and keepdims for tf.count_nonzero and string Signed-off-by: Yong Tang --- .../python/kernel_tests/reduction_ops_test.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py index 0be89e1ff4e..943b80b787d 100644 --- a/tensorflow/python/kernel_tests/reduction_ops_test.py +++ b/tensorflow/python/kernel_tests/reduction_ops_test.py @@ -889,9 +889,9 @@ class AnyReductionTest(test.TestCase): class CountNonzeroReductionTest(test.TestCase): - def _compare(self, x, reduction_axes, keepdims, use_gpu=False, + def _compare(self, x, reduction_axes, keepdims, use_gpu=False, zero=0, feed_dict=None): - np_ans = (x != 0).astype(np.int32) + np_ans = (x != zero).astype(np.int32) if reduction_axes is None: np_ans = np.sum(np_ans, keepdims=keepdims) else: @@ -964,6 +964,15 @@ class CountNonzeroReductionTest(test.TestCase): v = math_ops.count_nonzero(constant_op.constant(["test"])) self.assertAllClose(sess.run(v), 1) + def testStringReduce1D(self): + # Create a 1D array of strings + x = np.asarray(["", "", "a", "", "", "b"]) + self._compare(x, None, keepdims=False, zero=np.str("")) + self._compare(x, [], keepdims=False, zero=np.str("")) + self._compare(x, [0], keepdims=False, zero=np.str("")) + self._compare(x, None, keepdims=True, zero=np.str("")) + self._compare(x, [], keepdims=True, zero=np.str("")) + self._compare(x, [0], keepdims=True, zero=np.str("")) if __name__ == "__main__": test.main() From 01ab85f0fdce13f98b705c54901284a165ed7bd8 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Fri, 20 Apr 2018 02:53:57 +0000 Subject: [PATCH 0494/1734] Add n-D test cases for better coverage Signed-off-by: Yong Tang --- .../python/kernel_tests/reduction_ops_test.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py index 943b80b787d..ea78b58d88f 100644 --- a/tensorflow/python/kernel_tests/reduction_ops_test.py +++ b/tensorflow/python/kernel_tests/reduction_ops_test.py @@ -974,5 +974,21 @@ class CountNonzeroReductionTest(test.TestCase): self._compare(x, [], keepdims=True, zero=np.str("")) 
self._compare(x, [0], keepdims=True, zero=np.str("")) + def testStringReduce2D(self): + # Create a 2D array of strings + x = np.asarray([["", "", "a", "", "", "b"], + ["", "c", "", "d", "", ""], + ["e", "", "f", "", "", ""]]) + self._compare(x, None, keepdims=False, zero=np.str("")) + self._compare(x, [], keepdims=False, zero=np.str("")) + self._compare(x, [0], keepdims=False, zero=np.str("")) + self._compare(x, [1], keepdims=False, zero=np.str("")) + self._compare(x, [0, 1], keepdims=False, zero=np.str("")) + self._compare(x, None, keepdims=True, zero=np.str("")) + self._compare(x, [], keepdims=True, zero=np.str("")) + self._compare(x, [0], keepdims=True, zero=np.str("")) + self._compare(x, [0, 1], keepdims=True, zero=np.str("")) + + if __name__ == "__main__": test.main() From 38dcc57681612c2321169367c8756bb218472dd7 Mon Sep 17 00:00:00 2001 From: Yifei Feng Date: Thu, 19 Apr 2018 19:56:09 -0700 Subject: [PATCH 0495/1734] Revert part of tensorflow/core/grappler/optimizers/meta_optimizer.cc from #18479. --- .../grappler/optimizers/meta_optimizer.cc | 22 +------------------ 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index bca779c3b32..22799311bcd 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -168,26 +168,6 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item, TF_RETURN_IF_ERROR(register_by_name ? InitializeOptimizersByName(&optimizers) : InitializeOptimizers(&optimizers)); - // Append custom configurable optimizers. - std::vector - custom_configurable_optimizers; - for (const auto& optimizer : cfg_.custom_optimizers()) { - if (available_optimizers.find(optimizer.name()) != - available_optimizers.end()) { - optimizers.push_back(NewOptimizer(optimizer.name())); - } else { - custom_configurable_optimizers.push_back(optimizer); - } - } - // Now initialize and configure the custom optimizers. - for (const auto& optimizer : custom_configurable_optimizers) { - std::unique_ptr opt = - CustomGraphOptimizerRegistry::CreateByNameOrNull(optimizer.name()); - if (opt == nullptr) continue; - TF_RETURN_IF_ERROR(opt->Init(&optimizer)); - optimizers.push_back(std::move(opt)); - } - if (optimizers.empty()) { *optimized_graph = item.graph; return Status::OK(); @@ -341,7 +321,7 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg) { cfg.auto_parallel().enable() || cfg.memory_optimization() != RewriterConfig::NO_MEM_OPT || cfg.debug_stripper() == RewriterConfig::ON || - !cfg.optimizers().empty() || !cfg.custom_optimizers().empty(); + !cfg.optimizers().empty(); } Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg, From 4ef9de422d452683ac661d3a6313aeb2972b836d Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Thu, 19 Apr 2018 20:00:21 -0700 Subject: [PATCH 0496/1734] Always include the local worker in the list of filtered targets. It is currently legal to specify a device filter that doesn't include the local worker. In that case, the MasterSession includes all local devices regardless of the filter. This change extends this behavior to the list of filtered workers, which will be crucial for backwards compatibility when we enable CreateWorkerSession for all MasterSessions, because we need to call CreateWorkerSession on all potential workers. 
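For illustration (a hedged sketch, not part of this commit; the gRPC target below is a placeholder): device filters are supplied by the client via the session configuration, so a filter that names only the ps job would not, by itself, match the local worker:

    import tensorflow as tf

    # Only "/job:ps" targets match the filter; with this change the local
    # worker is nevertheless kept in the filtered list of targets.
    config = tf.ConfigProto(device_filters=["/job:ps"])
    sess = tf.Session("grpc://localhost:2222", config=config)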
PiperOrigin-RevId: 193613313 --- tensorflow/core/distributed_runtime/master.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc index 288656e7f80..e60386fd34a 100644 --- a/tensorflow/core/distributed_runtime/master.cc +++ b/tensorflow/core/distributed_runtime/master.cc @@ -167,13 +167,16 @@ class DeviceFinder { } // Enumerates all known workers' target. A target name is a // prefix of a device name. E.g., /job:mnist/replica:0/task:10. + CHECK_GT(env_->local_devices.size(), 0) << "No local devices provided."; + const string& local_device_name = env_->local_devices[0]->name(); std::vector workers; worker_cache->ListWorkers(&workers); if (filters_.empty()) { std::swap(workers, targets_); } else { for (const string& name : workers) { - if (MatchFilters(name)) { + if (MatchFilters(name) || + DeviceNameUtils::IsSameAddressSpace(name, local_device_name)) { targets_.push_back(name); } } From ddd763de08c5095d9a0dbb8acceb82135c0aa485 Mon Sep 17 00:00:00 2001 From: imsheridan Date: Fri, 20 Apr 2018 11:08:34 +0800 Subject: [PATCH 0497/1734] Fix unwanted typo caused protobuf load failure --- tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt index 743247bb60c..ad0aeac0042 100644 --- a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt @@ -80,4 +80,5 @@ $$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$ $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$ $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$ $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$ +END } From 7f3baa210a45cd0b41e21b63c2be6dd54230ea0b Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Fri, 20 Apr 2018 02:55:31 +0000 Subject: [PATCH 0498/1734] Update doc string for tf.count_nonzero to add string type Signed-off-by: Yong Tang --- tensorflow/python/ops/math_ops.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 8c9ad66b0e2..31ce83905b0 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -1467,7 +1467,8 @@ def count_nonzero(input_tensor, ``` Args: - input_tensor: The tensor to reduce. Should be of numeric type, or `bool`. + input_tensor: The tensor to reduce. Should be of numeric type, `string`, + or `bool`. axis: The dimensions to reduce. If `None` (the default), reduces all dimensions. Must be in the range `[-rank(input_tensor), rank(input_tensor))`. From 2273c4e56334caf31de01c6b6f8f4edd48432972 Mon Sep 17 00:00:00 2001 From: Yifei Feng Date: Thu, 19 Apr 2018 21:33:41 -0700 Subject: [PATCH 0499/1734] Skip tests with no_oss tag in XLA builds. PiperOrigin-RevId: 193619344 --- tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh b/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh index a94a627dfb6..a410c10b61b 100755 --- a/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh +++ b/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh @@ -35,7 +35,7 @@ echo "build --distinct_host_configuration=false" >> .tf_configure.bazelrc bazel clean # Run bazel test command. 
Double test timeouts to avoid flakes.
-bazel test --config=cuda --test_tag_filters=-no_gpu,-benchmark-test -k \
+bazel test --config=cuda --test_tag_filters=-no_gpu,-benchmark-test,-no_oss -k \
   --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
   --build_tests_only --test_output=errors --local_test_jobs=8 \
   --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute \

From 06bb3364795e443206910c98cee132d719cf41e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
Date: Fri, 20 Apr 2018 13:33:05 +0800
Subject: [PATCH 0500/1734] TST: byte string for python3

---
 .../python/kernel_tests/scatter_nd_ops_test.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index dfe9600dbb2..b7477a768ab 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -365,31 +365,35 @@ class ScatterNdTest(test.TestCase):
     return array_ops.scatter_nd(indices, updates, shape)
 
   def testString(self):
-    indices = constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32)
+    indices = constant_op.constant([[4], [3], [1], [7]],
+                                   dtype=dtypes.int32)
     updates = constant_op.constant(["four", "three", "one", "seven"],
                                    dtype=dtypes.string)
-    expected = np.array(["", "one", "", "three", "four", "", "", "seven"])
+    expected = np.array([b"", b"one", b"", b"three", b"four",
+                         b"", b"", b"seven"])
     scatter = self.scatter_nd(indices, updates, shape=(8,))
     with self.test_session() as sess:
       result = sess.run(scatter)
       self.assertAllEqual(expected, result)
 
     # Same index is updated twice with the same value.
-    indices = constant_op.constant([[4], [3], [3], [7]], dtype=dtypes.int32)
+    indices = constant_op.constant([[4], [3], [3], [7]],
+                                   dtype=dtypes.int32)
     updates = constant_op.constant(["a", "b", "b", "c"],
                                    dtype=dtypes.string)
-    expected = np.array(["", "", "", "bb", "a", "", "", "c"])
+    expected = np.array([b"", b"", b"", b"bb", b"a", b"", b"", b"c"])
     scatter = self.scatter_nd(indices, updates, shape=(8,))
     with self.test_session() as sess:
      result = sess.run(scatter)
       self.assertAllEqual(expected, result)
 
     # Same index is updated twice with different values.
-    indices = constant_op.constant([[4], [3], [3], [7]], dtype=dtypes.int32)
+    indices = constant_op.constant([[4], [3], [3], [7]],
+                                   dtype=dtypes.int32)
     updates = constant_op.constant(["a", "b", "c", "d"],
                                    dtype=dtypes.string)
-    expected = [np.array(["", "", "", "bc", "a", "", "", "d"]),
-                np.array(["", "", "", "cb", "a", "", "", "d"])]
+    expected = [np.array([b"", b"", b"", b"bc", b"a", b"", b"", b"d"]),
+                np.array([b"", b"", b"", b"cb", b"a", b"", b"", b"d"])]
     scatter = self.scatter_nd(indices, updates, shape=(8,))
    with self.test_session() as sess:
      result = sess.run(scatter)

From 70b8d21edcc84818835c9e2940a5df288c309d45 Mon Sep 17 00:00:00 2001
From: Roy Frostig
Date: Thu, 19 Apr 2018 23:01:07 -0700
Subject: [PATCH 0501/1734] [XLA] Rework the local XLA client's Shape class
 with separate array and tuple shape constructors.
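A rough usage sketch of the reworked constructors (illustrative only, inferred from the diff below; not part of the original commit message):

    import numpy as np
    from tensorflow.compiler.xla.python import xla_client

    # An array shape and a tuple shape built from the explicit constructors.
    array = xla_client.Shape.array_shape(np.dtype('float32'), (2, 3))
    pair = xla_client.Shape.tuple_shape((array, array))

    # from_pyval infers the same array shape from an example value.
    inferred = xla_client.Shape.from_pyval(np.zeros((2, 3), np.float32))
    assert inferred == array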
PiperOrigin-RevId: 193624591 --- .../compiler/xla/python/numpy_bridge.cc | 20 +-- tensorflow/compiler/xla/python/xla_client.py | 137 ++++++++++++------ .../compiler/xla/python/xla_client_test.py | 10 +- 3 files changed, 103 insertions(+), 64 deletions(-) diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc index eec48479c92..dc6f5fe5fcc 100644 --- a/tensorflow/compiler/xla/python/numpy_bridge.cc +++ b/tensorflow/compiler/xla/python/numpy_bridge.cc @@ -181,16 +181,6 @@ StatusOr XlaShapeFromPyShape(PyObject* o) { PyObjectCppRepr(o).c_str()); }; - auto get_attr = [o, &error](const string& field) -> StatusOr { - PyObject* result = - PyObject_GetAttrString(o, const_cast(field.c_str())); - if (result == nullptr) { - return error(tensorflow::strings::StrCat( - "Failed to get attribute of Shape object:", field)); - } - return result; - }; - auto call_method = [o, &error](const string& method) -> StatusOr { PyObject* result = PyObject_CallMethod(o, const_cast(method.c_str()), nullptr); @@ -202,12 +192,16 @@ StatusOr XlaShapeFromPyShape(PyObject* o) { }; PyObject* np_type; - TF_ASSIGN_OR_RETURN(np_type, get_attr("np_dtype")); + TF_ASSIGN_OR_RETURN(np_type, call_method("numpy_dtype")); if (np_type->ob_type != &PyArrayDescr_Type) { - return error("Shape attribute np_dtype is not an integer numpy dtype"); + return error( + "Return value of shape method numpy_dtype " + "is not an integer numpy dtype"); } if (!NumpyTypeIsValid(NumpyTypenum(np_type))) { - return error("Shape attribute np_dtype is not a valid integer numpy dtype"); + return error( + "Return value of shape method numpy_dtype " + "is not a valid integer numpy dtype"); } const PrimitiveType element_type = NumpyTypeToPrimitiveType(NumpyTypenum(np_type)); diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index 9c81f6439d0..f6809b6b871 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -166,14 +166,14 @@ class LocalBuffer(object): self._delete = c_api.DeleteLocalShapedBuffer @staticmethod - def from_py(npval, layout_fn=None): - npval = require_numpy_array_layout(npval) + def from_pyval(pyval, layout_fn=None): + pyval = require_numpy_array_layout(pyval) if layout_fn: - shape = Shape.from_numpy(npval) + shape = Shape.from_pyval(pyval) shape = shape.map_leaves(layout_fn) else: shape = None - return LocalBuffer(c_api.LocalShapedBuffer.FromLiteral(npval, shape)) + return LocalBuffer(c_api.LocalShapedBuffer.FromLiteral(pyval, shape)) def to_py(self): return self.c_local_shaped_buffer.ToLiteral() @@ -191,53 +191,104 @@ class LocalBuffer(object): class Shape(object): - """XLA shape. + """Represents an XLA shape. - Represents an XLA shape by a corresponding Python/Numpy type and a - list of dimensions, which are themselves Shapes in case this one - represents an XLA tuple. + A shape is either an array shape, having rank-many integer + dimensions and an element type (represented by a Numpy dtype), or it + is a tuple shape, having a shape for every tuple component: + + type shape = + TupleShape of shape list + | ArrayShape of { dimensions: int list; element_type: dtype } + + Callers are expected to instantiate this class only via the static + constructors: tuple_shape, array_shape, and from_pyval. 
""" - def __init__(self, np_dtype, dimensions, minor_to_major=None): + @staticmethod + def tuple_shape(tuple_shapes): + """Construct a tuple shape.""" + if (not isinstance(tuple_shapes, (tuple, list)) or + not all(isinstance(t, Shape) for t in tuple_shapes)): + raise TypeError('tuple_shapes must be a tuple of Shapes') + return Shape(tuple_shapes, tuple) + + @staticmethod + def array_shape(element_type, dimensions, minor_to_major=None): + """Construct an array shape.""" + if (not isinstance(dimensions, tuple) or + not all(isinstance(i, int) for i in dimensions)): + dimensions = tuple(int(i) for i in dimensions) + return Shape(dimensions, np.dtype(element_type), + minor_to_major=minor_to_major) + + @staticmethod + def from_pyval(pyval): + def convert(pyval): + if isinstance(pyval, tuple): + return Shape.tuple_shape(tuple(convert(elt) for elt in pyval)) + else: + pyval = require_numpy_array_layout(pyval) + return Shape.array_shape(pyval.dtype, np.shape(pyval)) + return convert(pyval) + + def __init__(self, dimensions, dtype, minor_to_major=None): assert isinstance(dimensions, tuple) - self.np_dtype = np_dtype self._dimensions = dimensions + self._dtype = dtype + self._is_tuple = dtype == tuple self._minor_to_major = minor_to_major self._check_minor_to_major() def __eq__(self, other): # pylint: disable=protected-access - return (self.np_dtype == other.np_dtype and + return (self._dtype == other._dtype and self._dimensions == other._dimensions and self._minor_to_major == other._minor_to_major) def __repr__(self): - return ('xla_client.Shape(np_dtype={!r}, dimensions={!r}, ' - 'minor_to_major={!r})').format(self.np_dtype, self._dimensions, - self._minor_to_major) - - def element_type(self): - return DTYPE_TO_XLA_ELEMENT_TYPE[str(self.np_dtype)] + return ('xla_client.Shape(_dtype={!r}, _dimensions={!r}, ' + '_is_tuple={!r}), _minor_to_major={!r}').format( + self._dtype, self._dimensions, self._is_tuple, + self._minor_to_major) def is_tuple(self): - return self.element_type() == xla_data_pb2.TUPLE + return self._is_tuple - def dimensions(self): - if self.is_tuple(): - raise ValueError('Tuple shape has no dimensions') - return self._dimensions - - def minor_to_major(self): - return self._minor_to_major + def is_array(self): + return not self._is_tuple def tuple_shapes(self): if not self.is_tuple(): - raise ValueError('Shape is not a tuple shape') + raise ValueError('not a tuple shape') + return self._dimensions + + def numpy_dtype(self): + """Like element_type(), but returns dtype('O') in case of a tuple shape.""" + if self.is_tuple(): + return np.dtype(np.object) + else: + return self.element_type() + + def xla_element_type(self): + return DTYPE_TO_XLA_ELEMENT_TYPE[str(self.numpy_dtype())] + + def element_type(self): + if not self.is_array(): + raise ValueError('not an array shape') + return self._dtype + + def dimensions(self): + if not self.is_array(): + raise ValueError('not an array shape') return self._dimensions def rank(self): return len(self.dimensions()) + def minor_to_major(self): + return self._minor_to_major + def map_leaves(self, f): """Map f over each leaf-level array subshape. 
@@ -250,7 +301,7 @@ class Shape(object): """ if self.is_tuple(): children = tuple(child.map_leaves(f) for child in self.tuple_shapes()) - return Shape(np.dtype('O'), children) + return Shape.tuple_shape(children) else: mapped = f(self) return self if mapped is None else mapped @@ -264,30 +315,24 @@ class Shape(object): assert sorted(mtm) == range(len(mtm)), self def update_minor_to_major(self, minor_to_major): + if not self.is_array(): + raise ValueError('not an array shape') if not isinstance(minor_to_major, tuple): raise TypeError('minor_to_major must be a tuple') - updated = Shape(self.np_dtype, tuple(self.dimensions()), minor_to_major) + updated = Shape.array_shape( + self.element_type(), self.dimensions(), minor_to_major) updated._check_minor_to_major() # pylint: disable=protected-access return updated - @staticmethod - def from_numpy(npval): - - def convert(npval): - if isinstance(npval, tuple): - return Shape(np.dtype('O'), tuple(convert(elt) for elt in npval)) - else: - return Shape(npval.dtype, np.shape(npval)) - - return convert(require_numpy_array_layout(npval)) - def _wrap_shape(shape_info): dtype, dims = shape_info element_type = DTYPE_TO_XLA_ELEMENT_TYPE[str(dtype)] if element_type == xla_data_pb2.TUPLE: - dims = tuple(_wrap_shape(subshape_info) for subshape_info in dims) - return Shape(dtype, dims) + shapes = tuple(_wrap_shape(subshape_info) for subshape_info in dims) + return Shape.tuple_shape(shapes) + else: + return Shape.array_shape(dtype, dims) def _wrap_data_handle(handle): @@ -420,7 +465,7 @@ class LocalComputation(object): compile_options=None, layout_fn=None): return self.Compile( - argument_shapes=[Shape.from_numpy(arg) for arg in arguments], + argument_shapes=[Shape.from_pyval(arg) for arg in arguments], compile_options=compile_options, layout_fn=layout_fn) @@ -428,7 +473,7 @@ class LocalComputation(object): """Execute with Python values as arguments and return value.""" if not self.is_compiled: raise ValueError('Cannot execute an uncompiled local XLA computation.') - argument_shapes = [Shape.from_numpy(arg) for arg in arguments] + argument_shapes = [Shape.from_pyval(arg) for arg in arguments] if layout_fn: argument_shapes = [ shape.map_leaves(layout_fn) for shape in argument_shapes @@ -607,7 +652,7 @@ class ComputationBuilder(object): A ComputationDataHandle message. """ return self.ParameterWithShape( - Shape.from_numpy(value), name=name, parameter_num=parameter_num) + Shape.from_pyval(value), name=name, parameter_num=parameter_num) def Broadcast(self, operand, sizes): """Enqueues a broadcast operation onto the computation. @@ -968,7 +1013,7 @@ class ComputationBuilder(object): Returns: a ComputationDataHandle to the generated array of F32 values. """ - shape = Shape(self.GetShape(mu).np_dtype, dims) + shape = Shape.array_shape(self.GetShape(mu).element_type(), dims) return _wrap_data_handle( self._client.RngNormal( _unwrap_data_handle(mu), _unwrap_data_handle(sigma), shape)) @@ -988,7 +1033,7 @@ class ComputationBuilder(object): Returns: a ComputationDataHandle to the generated array of values with the same numeric type (F32, S32, or U32) as the arguments a and b. 
""" - shape = Shape(self.GetShape(a).np_dtype, dims) + shape = Shape.array_shape(self.GetShape(a).element_type(), dims) return _wrap_data_handle( self._client.RngUniform( _unwrap_data_handle(a), _unwrap_data_handle(b), shape)) diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py index d97264ea640..6fe7b242e42 100644 --- a/tensorflow/compiler/xla/python/xla_client_test.py +++ b/tensorflow/compiler/xla/python/xla_client_test.py @@ -319,7 +319,7 @@ class LocalBufferTest(LocalComputationTest): def _Execute(self, c, arguments): compiled_c = c.Build().CompileWithExampleArguments(arguments) - arg_buffers = [xla_client.LocalBuffer.from_py(arg) for arg in arguments] + arg_buffers = [xla_client.LocalBuffer.from_pyval(arg) for arg in arguments] result_buffer = compiled_c.ExecuteWithLocalBuffers(arg_buffers) return result_buffer.to_py() @@ -350,7 +350,7 @@ class LocalBufferTest(LocalComputationTest): c.Add(c.ParameterFromNumpy(NumpyArrayF32(0.)), c.ConstantF32Scalar(3.14)) arg = NumpyArrayF32(1.11) compiled_c = c.Build().CompileWithExampleArguments([arg]) - arg_buffer = xla_client.LocalBuffer.from_py(arg) + arg_buffer = xla_client.LocalBuffer.from_pyval(arg) arg_buffer.delete() with self.assertRaises(ValueError): compiled_c.ExecuteWithLocalBuffers([arg_buffer]) @@ -1288,7 +1288,7 @@ class EmbeddedComputationsTest(LocalComputationTest): def testInfeedS32Values(self): to_infeed = NumpyArrayS32([1, 2, 3, 4]) c = self._NewComputation() - c.Infeed(xla_client.Shape.from_numpy(to_infeed[0])) + c.Infeed(xla_client.Shape.from_pyval(to_infeed[0])) compiled_c = c.Build().CompileWithExampleArguments() for item in to_infeed: xla_client.transfer_to_infeed(item) @@ -1300,7 +1300,7 @@ class EmbeddedComputationsTest(LocalComputationTest): def testInfeedThenOutfeedS32(self): to_round_trip = NumpyArrayS32([1, 2, 3, 4]) c = self._NewComputation() - x = c.Infeed(xla_client.Shape.from_numpy(to_round_trip[0])) + x = c.Infeed(xla_client.Shape.from_pyval(to_round_trip[0])) c.Outfeed(x) compiled_c = c.Build().CompileWithExampleArguments() @@ -1310,7 +1310,7 @@ class EmbeddedComputationsTest(LocalComputationTest): execution.start() xla_client.transfer_to_infeed(want) got = xla_client.transfer_from_outfeed( - xla_client.Shape.from_numpy(to_round_trip[0])) + xla_client.Shape.from_pyval(to_round_trip[0])) execution.join() self.assertEqual(want, got) From f7e8fbb28a0fa4e979a94d7b458706abf48f7deb Mon Sep 17 00:00:00 2001 From: Asim Shankar Date: Thu, 19 Apr 2018 23:08:53 -0700 Subject: [PATCH 0502/1734] Automated g4 rollback of changelist 193602050 PiperOrigin-RevId: 193625346 --- tensorflow/core/lib/io/record_reader.cc | 147 ++++---------- tensorflow/core/lib/io/record_reader.h | 16 +- tensorflow/core/lib/io/recordio_test.cc | 216 ++++++++++++++------- tensorflow/core/lib/io/zlib_inputstream.cc | 16 +- tensorflow/core/lib/io/zlib_inputstream.h | 19 +- 5 files changed, 222 insertions(+), 192 deletions(-) diff --git a/tensorflow/core/lib/io/record_reader.cc b/tensorflow/core/lib/io/record_reader.cc index 6de850bb207..c24628be570 100644 --- a/tensorflow/core/lib/io/record_reader.cc +++ b/tensorflow/core/lib/io/record_reader.cc @@ -56,110 +56,55 @@ RecordReaderOptions RecordReaderOptions::CreateRecordReaderOptions( RecordReader::RecordReader(RandomAccessFile* file, const RecordReaderOptions& options) - : src_(file), options_(options) { + : options_(options), + input_stream_(new RandomAccessInputStream(file)), + last_read_failed_(false) { if (options.buffer_size > 0) { - 
input_stream_.reset(new BufferedInputStream(file, options.buffer_size)); - } else { - input_stream_.reset(new RandomAccessInputStream(file)); + input_stream_.reset(new BufferedInputStream(input_stream_.release(), + options.buffer_size, true)); } if (options.compression_type == RecordReaderOptions::ZLIB_COMPRESSION) { // We don't have zlib available on all embedded platforms, so fail. #if defined(IS_SLIM_BUILD) LOG(FATAL) << "Zlib compression is unsupported on mobile platforms."; #else // IS_SLIM_BUILD - zlib_input_stream_.reset(new ZlibInputStream( - input_stream_.get(), options.zlib_options.input_buffer_size, - options.zlib_options.output_buffer_size, options.zlib_options)); + input_stream_.reset(new ZlibInputStream( + input_stream_.release(), options.zlib_options.input_buffer_size, + options.zlib_options.output_buffer_size, options.zlib_options, true)); #endif // IS_SLIM_BUILD } else if (options.compression_type == RecordReaderOptions::NONE) { // Nothing to do. } else { - LOG(FATAL) << "Unspecified compression type :" << options.compression_type; + LOG(FATAL) << "Unrecognized compression type :" << options.compression_type; } } // Read n+4 bytes from file, verify that checksum of first n bytes is // stored in the last 4 bytes and store the first n bytes in *result. -// May use *storage as backing store. -Status RecordReader::ReadChecksummed(uint64 offset, size_t n, - StringPiece* result, string* storage) { +// +// offset corresponds to the user-provided value to ReadRecord() +// and is used only in error messages. +Status RecordReader::ReadChecksummed(uint64 offset, size_t n, string* result) { if (n >= SIZE_MAX - sizeof(uint32)) { return errors::DataLoss("record size too large"); } const size_t expected = n + sizeof(uint32); - storage->resize(expected); + TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(expected, result)); -#if !defined(IS_SLIM_BUILD) - if (zlib_input_stream_) { - // If we have a zlib compressed buffer, we assume that the - // file is being read sequentially, and we use the underlying - // implementation to read the data. - // - // No checks are done to validate that the file is being read - // sequentially. At some point the zlib input buffer may support - // seeking, possibly inefficiently. - TF_RETURN_IF_ERROR(zlib_input_stream_->ReadNBytes(expected, storage)); - - if (storage->size() != expected) { - if (storage->empty()) { - return errors::OutOfRange("eof"); - } else { - return errors::DataLoss("truncated record at ", offset); - } - } - - uint32 masked_crc = core::DecodeFixed32(storage->data() + n); - if (crc32c::Unmask(masked_crc) != crc32c::Value(storage->data(), n)) { - return errors::DataLoss("corrupted record at ", offset); - } - *result = StringPiece(storage->data(), n); - } else { -#endif // IS_SLIM_BUILD - if (options_.buffer_size > 0) { - // If we have a buffer, we assume that the file is being read - // sequentially, and we use the underlying implementation to read the - // data. - // - // No checks are done to validate that the file is being read - // sequentially. 
- TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(expected, storage)); - - if (storage->size() != expected) { - if (storage->empty()) { - return errors::OutOfRange("eof"); - } else { - return errors::DataLoss("truncated record at ", offset); - } - } - - const uint32 masked_crc = core::DecodeFixed32(storage->data() + n); - if (crc32c::Unmask(masked_crc) != crc32c::Value(storage->data(), n)) { - return errors::DataLoss("corrupted record at ", offset); - } - *result = StringPiece(storage->data(), n); + if (result->size() != expected) { + if (result->empty()) { + return errors::OutOfRange("eof"); } else { - // This version supports reading from arbitrary offsets - // since we are accessing the random access file directly. - StringPiece data; - TF_RETURN_IF_ERROR(src_->Read(offset, expected, &data, &(*storage)[0])); - if (data.size() != expected) { - if (data.empty()) { - return errors::OutOfRange("eof"); - } else { - return errors::DataLoss("truncated record at ", offset); - } - } - const uint32 masked_crc = core::DecodeFixed32(data.data() + n); - if (crc32c::Unmask(masked_crc) != crc32c::Value(data.data(), n)) { - return errors::DataLoss("corrupted record at ", offset); - } - *result = StringPiece(data.data(), n); + return errors::DataLoss("truncated record at ", offset); } -#if !defined(IS_SLIM_BUILD) } -#endif // IS_SLIM_BUILD + const uint32 masked_crc = core::DecodeFixed32(result->data() + n); + if (crc32c::Unmask(masked_crc) != crc32c::Value(result->data(), n)) { + return errors::DataLoss("corrupted record at ", offset); + } + result->resize(n); return Status::OK(); } @@ -167,50 +112,42 @@ Status RecordReader::ReadRecord(uint64* offset, string* record) { static const size_t kHeaderSize = sizeof(uint64) + sizeof(uint32); static const size_t kFooterSize = sizeof(uint32); + // Position the input stream. + int64 curr_pos = input_stream_->Tell(); + int64 desired_pos = static_cast(*offset); + if (curr_pos > desired_pos || curr_pos < 0 /* EOF */ || + (curr_pos == desired_pos && last_read_failed_)) { + last_read_failed_ = false; + TF_RETURN_IF_ERROR(input_stream_->Reset()); + TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(desired_pos)); + } else if (curr_pos < desired_pos) { + TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(desired_pos - curr_pos)); + } + DCHECK_EQ(desired_pos, input_stream_->Tell()); + // Read header data. - StringPiece lbuf; - Status s = ReadChecksummed(*offset, sizeof(uint64), &lbuf, record); + Status s = ReadChecksummed(*offset, sizeof(uint64), record); if (!s.ok()) { + last_read_failed_ = true; return s; } - const uint64 length = core::DecodeFixed64(lbuf.data()); + const uint64 length = core::DecodeFixed64(record->data()); // Read data - StringPiece data; - s = ReadChecksummed(*offset + kHeaderSize, length, &data, record); + s = ReadChecksummed(*offset + kHeaderSize, length, record); if (!s.ok()) { + last_read_failed_ = true; if (errors::IsOutOfRange(s)) { s = errors::DataLoss("truncated record at ", *offset); } return s; } - if (record->data() != data.data()) { - // RandomAccessFile placed the data in some other location. 
- memmove(&(*record)[0], data.data(), data.size()); - } - - record->resize(data.size()); - *offset += kHeaderSize + length + kFooterSize; + DCHECK_EQ(*offset, input_stream_->Tell()); return Status::OK(); } -Status RecordReader::SkipNBytes(uint64 offset) { -#if !defined(IS_SLIM_BUILD) - if (zlib_input_stream_) { - TF_RETURN_IF_ERROR(zlib_input_stream_->SkipNBytes(offset)); - } else { -#endif - if (options_.buffer_size > 0) { - TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(offset)); - } -#if !defined(IS_SLIM_BUILD) - } -#endif - return Status::OK(); -} // namespace io - SequentialRecordReader::SequentialRecordReader( RandomAccessFile* file, const RecordReaderOptions& options) : underlying_(file, options), offset_(0) {} diff --git a/tensorflow/core/lib/io/record_reader.h b/tensorflow/core/lib/io/record_reader.h index 26278e03284..f6d587dfa0e 100644 --- a/tensorflow/core/lib/io/record_reader.h +++ b/tensorflow/core/lib/io/record_reader.h @@ -69,25 +69,14 @@ class RecordReader { // Read the record at "*offset" into *record and update *offset to // point to the offset of the next record. Returns OK on success, // OUT_OF_RANGE for end of file, or something else for an error. - // - // Note: if buffering is used (with or without compression), access must be - // sequential. Status ReadRecord(uint64* offset, string* record); - // Skip the records till "offset". Returns OK on success, - // OUT_OF_RANGE for end of file, or something else for an error. - Status SkipNBytes(uint64 offset); - private: - Status ReadChecksummed(uint64 offset, size_t n, StringPiece* result, - string* storage); + Status ReadChecksummed(uint64 offset, size_t n, string* result); - RandomAccessFile* src_; RecordReaderOptions options_; std::unique_ptr input_stream_; -#if !defined(IS_SLIM_BUILD) - std::unique_ptr zlib_input_stream_; -#endif // IS_SLIM_BUILD + bool last_read_failed_; TF_DISALLOW_COPY_AND_ASSIGN(RecordReader); }; @@ -121,7 +110,6 @@ class SequentialRecordReader { return errors::InvalidArgument( "Trying to seek offset: ", offset, " which is less than the current offset: ", offset_); - TF_RETURN_IF_ERROR(underlying_.SkipNBytes(offset - offset_)); offset_ = offset; return Status::OK(); } diff --git a/tensorflow/core/lib/io/recordio_test.cc b/tensorflow/core/lib/io/recordio_test.cc index 63235761d92..da514bd21c7 100644 --- a/tensorflow/core/lib/io/recordio_test.cc +++ b/tensorflow/core/lib/io/recordio_test.cc @@ -26,10 +26,11 @@ limitations under the License. namespace tensorflow { namespace io { +namespace { // Construct a string of the specified length made out of the supplied // partial string. 
-static string BigString(const string& partial_string, size_t n) { +string BigString(const string& partial_string, size_t n) { string result; while (result.size() < n) { result.append(partial_string); @@ -39,62 +40,66 @@ static string BigString(const string& partial_string, size_t n) { } // Construct a string from a number -static string NumberString(int n) { +string NumberString(int n) { char buf[50]; snprintf(buf, sizeof(buf), "%d.", n); return string(buf); } // Return a skewed potentially long string -static string RandomSkewedString(int i, random::SimplePhilox* rnd) { +string RandomSkewedString(int i, random::SimplePhilox* rnd) { return BigString(NumberString(i), rnd->Skewed(17)); } +class StringDest : public WritableFile { + public: + explicit StringDest(string* contents) : contents_(contents) {} + + Status Close() override { return Status::OK(); } + Status Flush() override { return Status::OK(); } + Status Sync() override { return Status::OK(); } + Status Append(const StringPiece& slice) override { + contents_->append(slice.data(), slice.size()); + return Status::OK(); + } + + private: + string* contents_; +}; + +class StringSource : public RandomAccessFile { + public: + explicit StringSource(string* contents) + : contents_(contents), force_error_(false) {} + + Status Read(uint64 offset, size_t n, StringPiece* result, + char* scratch) const override { + if (force_error_) { + force_error_ = false; + return errors::DataLoss("read error"); + } + + if (offset >= contents_->size()) { + return errors::OutOfRange("end of file"); + } + + if (contents_->size() < offset + n) { + n = contents_->size() - offset; + } + *result = StringPiece(contents_->data() + offset, n); + return Status::OK(); + } + + void force_error() { force_error_ = true; } + + private: + string* contents_; + mutable bool force_error_; +}; + class RecordioTest : public ::testing::Test { private: - class StringDest : public WritableFile { - public: - string contents_; - - Status Close() override { return Status::OK(); } - Status Flush() override { return Status::OK(); } - Status Sync() override { return Status::OK(); } - Status Append(const StringPiece& slice) override { - contents_.append(slice.data(), slice.size()); - return Status::OK(); - } - }; - - class StringSource : public RandomAccessFile { - public: - StringPiece contents_; - mutable bool force_error_; - mutable bool returned_partial_; - StringSource() : force_error_(false), returned_partial_(false) {} - - Status Read(uint64 offset, size_t n, StringPiece* result, - char* scratch) const override { - EXPECT_FALSE(returned_partial_) << "must not Read() after eof/error"; - - if (force_error_) { - force_error_ = false; - returned_partial_ = true; - return errors::DataLoss("read error"); - } - - if (offset >= contents_.size()) { - return errors::OutOfRange("end of file"); - } - - if (contents_.size() < offset + n) { - n = contents_.size() - offset; - returned_partial_ = true; - } - *result = StringPiece(contents_.data() + offset, n); - return Status::OK(); - } - }; - + string contents_; StringDest dest_; StringSource source_; bool reading_; @@ -104,7 +109,9 @@ class RecordioTest : public ::testing::Test { public: RecordioTest() - : reading_(false), + : dest_(&contents_), + source_(&contents_), + reading_(false), readpos_(0), writer_(new RecordWriter(&dest_)), reader_(new RecordReader(&source_)) {} @@ -119,12 +126,11 @@ class RecordioTest : public ::testing::Test { TF_ASSERT_OK(writer_->WriteRecord(StringPiece(msg))); } - size_t WrittenBytes() const { return 
dest_.contents_.size(); } + size_t WrittenBytes() const { return contents_.size(); } string Read() { if (!reading_) { reading_ = true; - source_.contents_ = StringPiece(dest_.contents_); } string record; Status s = reader_->ReadRecord(&readpos_, &record); @@ -137,26 +143,20 @@ class RecordioTest : public ::testing::Test { } } - void IncrementByte(int offset, int delta) { - dest_.contents_[offset] += delta; - } + void IncrementByte(int offset, int delta) { contents_[offset] += delta; } - void SetByte(int offset, char new_byte) { - dest_.contents_[offset] = new_byte; - } + void SetByte(int offset, char new_byte) { contents_[offset] = new_byte; } - void ShrinkSize(int bytes) { - dest_.contents_.resize(dest_.contents_.size() - bytes); - } + void ShrinkSize(int bytes) { contents_.resize(contents_.size() - bytes); } void FixChecksum(int header_offset, int len) { // Compute crc of type/len/data - uint32_t crc = crc32c::Value(&dest_.contents_[header_offset + 6], 1 + len); + uint32_t crc = crc32c::Value(&contents_[header_offset + 6], 1 + len); crc = crc32c::Mask(crc); - core::EncodeFixed32(&dest_.contents_[header_offset], crc); + core::EncodeFixed32(&contents_[header_offset], crc); } - void ForceError() { source_.force_error_ = true; } + void ForceError() { source_.force_error(); } void StartReadingAt(uint64_t initial_offset) { readpos_ = initial_offset; } @@ -165,7 +165,6 @@ class RecordioTest : public ::testing::Test { Write("bar"); Write(BigString("x", 10000)); reading_ = true; - source_.contents_ = StringPiece(dest_.contents_); uint64 offset = WrittenBytes() + offset_past_end; string record; Status s = reader_->ReadRecord(&offset, &record); @@ -217,16 +216,100 @@ TEST_F(RecordioTest, RandomRead) { ASSERT_EQ("EOF", Read()); } +void TestNonSequentialReads(const RecordWriterOptions& writer_options, + const RecordReaderOptions& reader_options) { + string contents; + StringDest dst(&contents); + RecordWriter writer(&dst, writer_options); + for (int i = 0; i < 10; ++i) { + TF_ASSERT_OK(writer.WriteRecord(NumberString(i))) << i; + } + TF_ASSERT_OK(writer.Close()); + + StringSource file(&contents); + RecordReader reader(&file, reader_options); + + string record; + // First read sequentially to fill in the offsets table. + uint64 offsets[10] = {0}; + uint64 offset = 0; + for (int i = 0; i < 10; ++i) { + offsets[i] = offset; + TF_ASSERT_OK(reader.ReadRecord(&offset, &record)) << i; + } + + // Read randomly: First go back to record #3 then forward to #8. 
+ offset = offsets[3]; + TF_ASSERT_OK(reader.ReadRecord(&offset, &record)); + EXPECT_EQ("3.", record); + EXPECT_EQ(offsets[4], offset); + + offset = offsets[8]; + TF_ASSERT_OK(reader.ReadRecord(&offset, &record)); + EXPECT_EQ("8.", record); + EXPECT_EQ(offsets[9], offset); +} + +TEST_F(RecordioTest, NonSequentialReads) { + TestNonSequentialReads(RecordWriterOptions(), RecordReaderOptions()); +} + +TEST_F(RecordioTest, NonSequentialReadsWithReadBuffer) { + RecordReaderOptions options; + options.buffer_size = 1 << 10; + TestNonSequentialReads(RecordWriterOptions(), options); +} + +TEST_F(RecordioTest, NonSequentialReadsWithCompression) { + TestNonSequentialReads( + RecordWriterOptions::CreateRecordWriterOptions("ZLIB"), + RecordReaderOptions::CreateRecordReaderOptions("ZLIB")); +} + // Tests of all the error paths in log_reader.cc follow: -static void AssertHasSubstr(StringPiece s, StringPiece expected) { +void AssertHasSubstr(StringPiece s, StringPiece expected) { EXPECT_TRUE(str_util::StrContains(s, expected)) << s << " does not contain " << expected; } +void TestReadError(const RecordWriterOptions& writer_options, + const RecordReaderOptions& reader_options) { + const string wrote = BigString("well hello there!", 100); + string contents; + StringDest dst(&contents); + TF_ASSERT_OK(RecordWriter(&dst, writer_options).WriteRecord(wrote)); + + StringSource file(&contents); + RecordReader reader(&file, reader_options); + + uint64 offset = 0; + string read; + file.force_error(); + Status status = reader.ReadRecord(&offset, &read); + ASSERT_TRUE(errors::IsDataLoss(status)); + ASSERT_EQ(0, offset); + + // A failed Read() shouldn't update the offset, and thus a retry shouldn't + // lose the record. + status = reader.ReadRecord(&offset, &read); + ASSERT_TRUE(status.ok()) << status; + EXPECT_GT(offset, 0); + EXPECT_EQ(wrote, read); +} + TEST_F(RecordioTest, ReadError) { - Write("foo"); - ForceError(); - AssertHasSubstr(Read(), "Data loss"); + TestReadError(RecordWriterOptions(), RecordReaderOptions()); +} + +TEST_F(RecordioTest, ReadErrorWithBuffering) { + RecordReaderOptions options; + options.buffer_size = 1 << 20; + TestReadError(RecordWriterOptions(), options); +} + +TEST_F(RecordioTest, ReadErrorWithCompression) { + TestReadError(RecordWriterOptions::CreateRecordWriterOptions("ZLIB"), + RecordReaderOptions::CreateRecordReaderOptions("ZLIB")); } TEST_F(RecordioTest, CorruptLength) { @@ -257,5 +340,6 @@ TEST_F(RecordioTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); } TEST_F(RecordioTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); } +} // namespace } // namespace io } // namespace tensorflow diff --git a/tensorflow/core/lib/io/zlib_inputstream.cc b/tensorflow/core/lib/io/zlib_inputstream.cc index 984fbc2810c..47de36bf6c6 100644 --- a/tensorflow/core/lib/io/zlib_inputstream.cc +++ b/tensorflow/core/lib/io/zlib_inputstream.cc @@ -25,8 +25,9 @@ ZlibInputStream::ZlibInputStream( InputStreamInterface* input_stream, size_t input_buffer_bytes, // size of z_stream.next_in buffer size_t output_buffer_bytes, // size of z_stream.next_out buffer - const ZlibCompressionOptions& zlib_options) - : input_stream_(input_stream), + const ZlibCompressionOptions& zlib_options, bool owns_input_stream) + : owns_input_stream_(owns_input_stream), + input_stream_(input_stream), input_buffer_capacity_(input_buffer_bytes), output_buffer_capacity_(output_buffer_bytes), z_stream_input_(new Bytef[input_buffer_capacity_]), @@ -37,14 +38,25 @@ ZlibInputStream::ZlibInputStream( InitZlibBuffer(); } 
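// For illustration only, not part of the patch: a minimal sketch of how the
// ownership-taking ZlibInputStream constructor above is meant to be used when
// chaining streams. The buffer sizes and the DEFAULT() options helper are
// assumptions, not taken from this diff.
//
//   InputStreamInterface* base = new RandomAccessInputStream(file);
//   ZlibInputStream zlib(base, /*input_buffer_bytes=*/4096,
//                        /*output_buffer_bytes=*/4096,
//                        ZlibCompressionOptions::DEFAULT(),
//                        /*owns_input_stream=*/true);
//   // zlib deletes base in its destructor (see the ~ZlibInputStream change
//   // below); with owns_input_stream=false the caller keeps ownership,
//   // matching the old behavior.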
+ZlibInputStream::ZlibInputStream(InputStreamInterface* input_stream, + size_t input_buffer_bytes, + size_t output_buffer_bytes, + const ZlibCompressionOptions& zlib_options) + : ZlibInputStream(input_stream, input_buffer_bytes, output_buffer_bytes, + zlib_options, false) {} + ZlibInputStream::~ZlibInputStream() { if (z_stream_) { inflateEnd(z_stream_.get()); } + if (owns_input_stream_) { + delete input_stream_; + } } Status ZlibInputStream::Reset() { TF_RETURN_IF_ERROR(input_stream_->Reset()); + inflateEnd(z_stream_.get()); InitZlibBuffer(); bytes_read_ = 0; return Status::OK(); diff --git a/tensorflow/core/lib/io/zlib_inputstream.h b/tensorflow/core/lib/io/zlib_inputstream.h index 9c7e14441ce..37339163ee0 100644 --- a/tensorflow/core/lib/io/zlib_inputstream.h +++ b/tensorflow/core/lib/io/zlib_inputstream.h @@ -40,7 +40,15 @@ class ZlibInputStream : public InputStreamInterface { // Create a ZlibInputStream for `input_stream` with a buffer of size // `input_buffer_bytes` bytes for reading contents from `input_stream` and // another buffer with size `output_buffer_bytes` for caching decompressed - // contents. Does *not* take ownership of "input_stream". + // contents. + // + // Takes ownership of `input_stream` iff `owns_input_stream` is true. + ZlibInputStream(InputStreamInterface* input_stream, size_t input_buffer_bytes, + size_t output_buffer_bytes, + const ZlibCompressionOptions& zlib_options, + bool owns_input_stream); + + // Equivalent to the previous constructor with owns_input_stream=false. ZlibInputStream(InputStreamInterface* input_stream, size_t input_buffer_bytes, size_t output_buffer_bytes, const ZlibCompressionOptions& zlib_options); @@ -65,10 +73,11 @@ class ZlibInputStream : public InputStreamInterface { private: void InitZlibBuffer(); - InputStreamInterface* input_stream_; // Not owned - size_t input_buffer_capacity_; // Size of z_stream_input_ - size_t output_buffer_capacity_; // Size of z_stream_output_ - char* next_unread_byte_; // Next unread byte in z_stream_output_ + const bool owns_input_stream_; + InputStreamInterface* input_stream_; + size_t input_buffer_capacity_; // Size of z_stream_input_ + size_t output_buffer_capacity_; // Size of z_stream_output_ + char* next_unread_byte_; // Next unread byte in z_stream_output_ // Buffer for storing contents read from compressed stream. // TODO(srbs): Consider using circular buffers. That would greatly simplify From d2fd0bbac6368a6b41e73d18c93b24442f5653f1 Mon Sep 17 00:00:00 2001 From: Dimitris Vardoulakis Date: Thu, 19 Apr 2018 23:35:04 -0700 Subject: [PATCH 0503/1734] [TF:XLA] Factor out the handling of while instructions to make HloVerifier::Run shorter. 
PiperOrigin-RevId: 193626864 --- .../compiler/xla/service/hlo_verifier.cc | 83 +++++++++++-------- .../compiler/xla/service/hlo_verifier.h | 8 +- 2 files changed, 55 insertions(+), 36 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 8c875698eb1..80ed6d68324 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -731,6 +731,55 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const { return tensorflow::Status::OK(); } +Status HloVerifier::CheckWhileInstruction(HloInstruction* instruction) { + auto* while_cond = instruction->while_condition(); + auto* while_body = instruction->while_body(); + if (while_cond->num_parameters() != 1) { + return FailedPrecondition( + "While condition must have exactly 1 parameter; had %lld : %s", + while_cond->num_parameters(), while_cond->ToString().c_str()); + } + if (while_body->num_parameters() != 1) { + return FailedPrecondition( + "While body must have exactly 1 parameter; had %lld : %s", + while_body->num_parameters(), while_body->ToString().c_str()); + } + if (instruction->operand_count() != 1) { + return FailedPrecondition( + "While loop must have exactly one operand; had %lld : %s", + instruction->operand_count(), instruction->ToString().c_str()); + } + auto* init = instruction->operand(0); + auto* cond_param = while_cond->parameter_instruction(0); + if (!ShapeUtil::Compatible(init->shape(), cond_param->shape())) { + return FailedPrecondition( + "While condition's parameter must have the same shape as the " + "loop's 'init'. init: %s, param: %s", + init->ToString().c_str(), cond_param->ToString().c_str()); + } + auto* cond_root = while_cond->root_instruction(); + if (!ShapeUtil::Compatible(cond_root->shape(), + ShapeUtil::MakeShape(PRED, {}))) { + return FailedPrecondition("While condition should have shape PRED: %s", + cond_root->ToString().c_str()); + } + auto* body_param = while_body->parameter_instruction(0); + if (!ShapeUtil::Compatible(init->shape(), body_param->shape())) { + return FailedPrecondition( + "While body's parameter must have the same shape as the loop's" + " 'init'. init: %s, param: %s", + init->ToString().c_str(), body_param->ToString().c_str()); + } + auto* body_root = while_body->root_instruction(); + if (!ShapeUtil::Compatible(init->shape(), body_root->shape())) { + return FailedPrecondition( + "While body should have same shape as the loop's 'init'." 
+ "init: %s, body: %s", + init->ToString().c_str(), body_root->ToString().c_str()); + } + return tensorflow::Status::OK(); +} + StatusOr HloVerifier::Run(HloModule* module) { TF_RETURN_IF_ERROR(VerifyHloStructure(module)); @@ -771,39 +820,7 @@ StatusOr HloVerifier::Run(HloModule* module) { << instruction->dimensions().size() << " != " << ShapeUtil::Rank(instruction->operand(0)->shape()); } else if (instruction->opcode() == HloOpcode::kWhile) { - auto* while_cond = instruction->while_condition(); - auto* while_body = instruction->while_body(); - TF_RET_CHECK(while_cond->num_parameters() == 1) - << "While condition must have exactly 1 parameter; had " - << while_cond->num_parameters() << ": " << while_cond->ToString(); - TF_RET_CHECK(while_body->num_parameters() == 1) - << "While body must have exactly 1 parameter; had " - << while_body->num_parameters() << ": " << while_body->ToString(); - TF_RET_CHECK(instruction->operand_count() == 1) - << "While loop must have exactly one operand; had " - << instruction->operand_count() << ": " << instruction->ToString(); - - auto* init = instruction->operand(0); - auto* cond_param = while_cond->parameter_instruction(0); - TF_RET_CHECK(ShapeUtil::Compatible(init->shape(), cond_param->shape())) - << "While condition's parameter must have the same shape as the " - "loop's 'init'. init: " - << init->ToString() << ", param: " << cond_param->ToString(); - auto* cond_root = while_cond->root_instruction(); - TF_RET_CHECK(ShapeUtil::Compatible(cond_root->shape(), - ShapeUtil::MakeShape(PRED, {}))) - << "While condition should have shape PRED: " - << cond_root->ToString(); - - auto* body_param = while_body->parameter_instruction(0); - TF_RET_CHECK(ShapeUtil::Compatible(init->shape(), body_param->shape())) - << "While body's parameter must have the same shape as the loop's " - "'init'. init: " - << init->ToString() << ", param: " << body_param->ToString(); - auto* body_root = while_body->root_instruction(); - TF_RET_CHECK(ShapeUtil::Compatible(init->shape(), body_root->shape())) - << "While body should have same shape as the loop's 'init'. init: " - << init->ToString() << ", body: " << body_root->ToString(); + TF_RETURN_IF_ERROR(CheckWhileInstruction(instruction)); } auto previous = instructions.find(instruction->name()); diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h index 1dd7ec3c51e..1ec55a9bdc9 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.h +++ b/tensorflow/compiler/xla/service/hlo_verifier.h @@ -102,7 +102,7 @@ class ShapeVerifier : public DfsHloVisitor { Status CheckTernaryShape(const HloInstruction* instruction); Status CheckVariadicShape(const HloInstruction* instruction); - // Checks if the given two instructions shares the same channel id. + // Checks if the given two instructions share the same channel id. Status CheckSameChannel(const HloInstruction* instr1, const HloInstruction* instr2); @@ -144,9 +144,11 @@ class HloVerifier : public HloPassInterface { // CHECKs various invariants of a fusion instruction. Status CheckFusionInstruction(HloInstruction* fusion) const; + Status CheckWhileInstruction(HloInstruction* instruction); + // Creates a ShapeVerifier that checks that shapes match inferred - // expectations. This is a factory function because ShapeVerifier, Note that - // ShapeVerifier, being a DfsHloVisitor, is stateful. We want a clean object + // expectations. This is a factory function because ShapeVerifier, + // being a DfsHloVisitor, is stateful. 
We want a clean object // for each run of the verifier. ShapeVerifierFactory shape_verifier_factory_; }; From 4e9dae45b3017f13eb68603294c6c28a63656050 Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Fri, 20 Apr 2018 15:35:42 +0800 Subject: [PATCH 0504/1734] change ms to us and make timestamp uint64 1. microsecond usually is denoted as us; ms is millisecond 2. make timestamp uint64 all the way --- tensorflow/contrib/lite/profiling/profile_buffer.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tensorflow/contrib/lite/profiling/profile_buffer.h b/tensorflow/contrib/lite/profiling/profile_buffer.h index 3bfe02571ba..299b2a9cad1 100644 --- a/tensorflow/contrib/lite/profiling/profile_buffer.h +++ b/tensorflow/contrib/lite/profiling/profile_buffer.h @@ -37,9 +37,9 @@ struct ProfileEvent { // Label of the event. This usually describes the event. const char* tag; // Timestamp in microseconds when the event began. - int64_t begin_timestamp_ms; + uint64_t begin_timestamp_us; // Timestamp in microseconds when the event ended. - int64_t end_timestamp_ms; + uint64_t end_timestamp_us; // The field containing the type of event. This must be one of the event types // in EventType. EventType event_type; @@ -74,13 +74,13 @@ class ProfileBuffer { if (!enabled_) { return kInvalidEventHandle; } - int64_t timestamp = NowMicros(); + uint64_t timestamp = NowMicros(); int index = current_index_ % event_buffer_.size(); event_buffer_[index].tag = tag; event_buffer_[index].event_type = event_type; event_buffer_[index].event_metadata = event_metadata; - event_buffer_[index].begin_timestamp_ms = timestamp; - event_buffer_[index].end_timestamp_ms = 0; + event_buffer_[index].begin_timestamp_us = timestamp; + event_buffer_[index].end_timestamp_us = 0; current_index_++; return index; } @@ -103,7 +103,7 @@ class ProfileBuffer { } int event_index = event_handle % max_size; - event_buffer_[event_index].end_timestamp_ms = NowMicros(); + event_buffer_[event_index].end_timestamp_us = NowMicros(); } // Returns the size of the buffer. @@ -134,7 +134,7 @@ class ProfileBuffer { } private: - static int64_t NowMicros() { + static uint64_t NowMicros() { // TODO(shashishekhar): Refactor this to a separate file. 
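// Not part of the patch: the rest of this function, unchanged by the diff,
// combines the two gettimeofday() fields into a single microsecond count,
// roughly as
//   return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
// which is why the return type above widens to uint64_t.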
struct timeval tv; gettimeofday(&tv, nullptr); From d3b91ba5696e998ea9155a91f58b6b6ba2afd340 Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Fri, 20 Apr 2018 17:05:22 +0800 Subject: [PATCH 0505/1734] add profiling mechanism build with something like: ``` bazel build --config android_arm64 \ --cxxopt=-std=c++11 \ --cxxopt=-DTFLITE_PROFILING_ENABLED \ //tensorflow/contrib/lite/examples/label_image:label_image ``` run `label_image` will get something like: ``` ./label_image -p 1 Loaded model ./mobilenet_quant_v1_224.tflite resolved reporter invoked average time: 67.227 ms 13.349, Node 0, OpCode 3, CONV_2D 6.024, Node 1, OpCode 4, DEPTHWISE_CONV_2D 11.847, Node 2, OpCode 3, CONV_2D 3.927, Node 3, OpCode 4, DEPTHWISE_CONV_2D 1.905, Node 4, OpCode 3, CONV_2D 3.573, Node 5, OpCode 4, DEPTHWISE_CONV_2D 2.344, Node 6, OpCode 3, CONV_2D 0.964, Node 7, OpCode 4, DEPTHWISE_CONV_2D 1.224, Node 8, OpCode 3, CONV_2D 1.846, Node 9, OpCode 4, DEPTHWISE_CONV_2D 2.181, Node 10, OpCode 3, CONV_2D 0.454, Node 11, OpCode 4, DEPTHWISE_CONV_2D 0.997, Node 12, OpCode 3, CONV_2D 0.865, Node 13, OpCode 4, DEPTHWISE_CONV_2D 1.844, Node 14, OpCode 3, CONV_2D 0.753, Node 15, OpCode 4, DEPTHWISE_CONV_2D 1.724, Node 16, OpCode 3, CONV_2D 0.803, Node 17, OpCode 4, DEPTHWISE_CONV_2D 1.698, Node 18, OpCode 3, CONV_2D 0.794, Node 19, OpCode 4, DEPTHWISE_CONV_2D 1.754, Node 20, OpCode 3, CONV_2D 0.798, Node 21, OpCode 4, DEPTHWISE_CONV_2D 1.704, Node 22, OpCode 3, CONV_2D 0.204, Node 23, OpCode 4, DEPTHWISE_CONV_2D 0.983, Node 24, OpCode 3, CONV_2D 0.373, Node 25, OpCode 4, DEPTHWISE_CONV_2D 1.791, Node 26, OpCode 3, CONV_2D 0.067, Node 27, OpCode 1, AVERAGE_POOL_2D 0.388, Node 28, OpCode 3, CONV_2D 0.001, Node 29, OpCode 22, RESHAPE 0.035, Node 30, OpCode 25, SOFTMAX 0.600: 458 bow tie 0.365: 653 military uniform 0.008: 835 suit 0.008: 611 jersey 0.004: 514 cornet ``` --- .../lite/examples/label_image/label_image.cc | 47 +++++++++++++++++-- .../lite/examples/label_image/label_image.h | 1 + 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.cc b/tensorflow/contrib/lite/examples/label_image/label_image.cc index a91467d345f..71d24a7ea5c 100644 --- a/tensorflow/contrib/lite/examples/label_image/label_image.cc +++ b/tensorflow/contrib/lite/examples/label_image/label_image.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include
#include
#include
+#include
#include
#include
#include
@@ -70,6 +71,23 @@ TfLiteStatus ReadLabelsFile(const string& file_name,
return kTfLiteOk;
}

+void PrintProfilingInfo(const profiling::ProfileEvent* e, uint32_t op_index,
+ TfLiteRegistration registration) {
+ // output something like
+ // time (ms) , Node xxx, OpCode xxx, symbolic name
+ // 5.352, Node 5, OpCode 4, DEPTHWISE_CONV_2D
+
+
+ LOG(INFO) << std::fixed << std::setw(10) << std::setprecision(3)
+ << (e->end_timestamp_us - e->begin_timestamp_us) / 1000.0
+ << ", Node " << std::setw(3) << std::setprecision(3) << op_index
+ << ", OpCode " << std::setw(3) << std::setprecision(3)
+ << registration.builtin_code << ", "
+ << EnumNameBuiltinOperator(
+ (BuiltinOperator)registration.builtin_code)
+ << "\n";
+}
+
void RunInference(Settings* s) {
if (!s->model_name.c_str()) {
LOG(ERROR) << "no model file name\n";
@@ -89,7 +107,7 @@ void RunInference(Settings* s) {
tflite::ops::builtin::BuiltinOpResolver resolver;

- tflite::InterpreterBuilder(*model, resolver)(&interpreter);
+ tflite::InterpreterBuilder (*model, resolver)(&interpreter);
if (!interpreter) {
LOG(FATAL) << "Failed to construct interpreter\n";
exit(-1);
}
@@ -166,6 +184,11 @@ void RunInference(Settings* s) {
exit(-1);
}

+ profiling::Profiler* profiler = new profiling::Profiler();
+ interpreter->SetProfiler(profiler);
+
+ if (s->profiling) profiler->StartProfiling();
+
struct timeval start_time, stop_time;
gettimeofday(&start_time, NULL);
for (int i = 0; i < s->loop_count; i++) {
@@ -179,6 +202,18 @@
<< (get_us(stop_time) - get_us(start_time)) / (s->loop_count * 1000)
<< " ms \n";

+ if (s->profiling) {
+ profiler->StopProfiling();
+ auto profile_events = profiler->GetProfileEvents();
+ for (int i = 0; i < profile_events.size(); i++) {
+ auto op_index = profile_events[i]->event_metadata;
+ const auto node_and_registration =
+ interpreter->node_and_registration(op_index);
+ const TfLiteRegistration registration = node_and_registration->second;
+ PrintProfilingInfo(profile_events[i], op_index, registration);
+ }
+ }
+
const int output_size = 1000;
const size_t num_results = 5;
const float threshold = 0.001f;
@@ -217,13 +252,14 @@ void RunInference(Settings* s) {

void display_usage() {
LOG(INFO) << "label_image\n"
- << "--accelerated, -a: [0|1], use Android NNAPI or note\n"
+ << "--accelerated, -a: [0|1], use Android NNAPI or not\n"
<< "--count, -c: loop interpreter->Invoke() for certain times\n"
<< "--input_mean, -b: input mean\n"
<< "--input_std, -s: input standard deviation\n"
<< "--image, -i: image_name.bmp\n"
<< "--labels, -l: labels for the model\n"
<< "--tflite_model, -m: model_name.tflite\n"
+ << "--profiling, -p: [0|1], profiling or not\n"
<< "--threads, -t: number of threads\n"
<< "--verbose, -v: [0|1] print more information\n"
<< "\n";
@@ -241,6 +277,7 @@ int Main(int argc, char** argv) {
{"image", required_argument, 0, 'i'},
{"labels", required_argument, 0, 'l'},
{"tflite_model", required_argument, 0, 'm'},
+ {"profiling", required_argument, 0, 'p'},
{"threads", required_argument, 0, 't'},
{"input_mean", required_argument, 0, 'b'},
{"input_std", required_argument, 0, 's'},

/* getopt_long stores the option index here. */
int option_index = 0;

- c = getopt_long(argc, argv, "a:b:c:f:i:l:m:s:t:v:", long_options,
+ c = getopt_long(argc, argv, "a:b:c:f:i:l:m:p:s:t:v:", long_options,
&option_index);

/* Detect the end of the options.
*/ @@ -276,6 +313,10 @@ int Main(int argc, char** argv) { case 'm': s.model_name = optarg; break; + case 'p': + s.profiling = strtol( // NOLINT(runtime/deprecated_fn) + optarg, (char**)NULL, 10); + break; case 's': s.input_std = strtod(optarg, NULL); break; diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.h b/tensorflow/contrib/lite/examples/label_image/label_image.h index 4de32e33fb4..4b48014e1c7 100644 --- a/tensorflow/contrib/lite/examples/label_image/label_image.h +++ b/tensorflow/contrib/lite/examples/label_image/label_image.h @@ -25,6 +25,7 @@ struct Settings { bool verbose = false; bool accel = false; bool input_floating = false; + bool profiling = false; int loop_count = 1; float input_mean = 127.5f; float input_std = 127.5f; From 9e0037513040fd09ee01442bd062936b41bee40c Mon Sep 17 00:00:00 2001 From: SukHwan Kim <30820468+jerry4897@users.noreply.github.com> Date: Fri, 20 Apr 2018 18:24:52 +0900 Subject: [PATCH 0506/1734] Update c_api_test.cc Typo --- tensorflow/c/c_api_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc index ca80db23ed3..9b86425aa5f 100644 --- a/tensorflow/c/c_api_test.cc +++ b/tensorflow/c/c_api_test.cc @@ -1700,7 +1700,7 @@ TEST_F(CApiGradientsTest, OpWithNoGradientRegistered_NoGradInputs) { TestGradientsError(false); } -// REGISTER_OP for CApiTestAttributesTest test cases. +// REGISTER_OP for CApiAttributesTest test cases. // Registers two ops, each with a single attribute called 'v'. // The attribute in one op will have a type 'type', the other // will have list(type). From 1ad32703d4e728d8fba835aaf24418f19cf85dbe Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Fri, 20 Apr 2018 03:29:31 -0700 Subject: [PATCH 0507/1734] [TF:XLA] Implement ClipByValue. PiperOrigin-RevId: 193646890 --- tensorflow/compiler/tests/ternary_ops_test.py | 18 ++++++ tensorflow/compiler/tf2xla/kernels/BUILD | 1 + .../tf2xla/kernels/clip_by_value_op.cc | 61 +++++++++++++++++++ 3 files changed, 80 insertions(+) create mode 100644 tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc diff --git a/tensorflow/compiler/tests/ternary_ops_test.py b/tensorflow/compiler/tests/ternary_ops_test.py index ba5f829936f..75a2cf07c5a 100644 --- a/tensorflow/compiler/tests/ternary_ops_test.py +++ b/tensorflow/compiler/tests/ternary_ops_test.py @@ -23,6 +23,7 @@ import numpy as np from tensorflow.compiler.tests.xla_test import XLATestCase from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import googletest @@ -119,6 +120,23 @@ class TernaryOpsTest(XLATestCase): np.array([2, 1], dtype=np.int32), expected=np.array([[2], [5]], dtype=dtype)) + def testClipByValue(self): + # TODO(b/78258593): enable integer types here too. 
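# Not part of the patch: the expected values computed in this test follow the
# clamp identity clip(x, lo, hi) == min(max(x, lo), hi). For instance, with
# x = [-2, 10, 6] and hypothetical scalar bounds lo = 1, hi = 7:
#   np.minimum(np.maximum(np.array([-2, 10, 6]), 1), 7)  # => [1, 7, 6]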
+ for dtype in self.float_types: + test_cases = [ + (np.array([2, 4, 5], dtype=dtype), dtype(7)), # + (dtype(1), np.array([2, 4, 5], dtype=dtype)), # + (np.array([-2, 7, 7], dtype=dtype), np.array([-2, 9, 8], dtype=dtype)) + ] + x = np.array([-2, 10, 6], dtype=dtype) + for lower, upper in test_cases: + self._testTernary( + gen_math_ops._clip_by_value, + x, + lower, + upper, + expected=np.minimum(np.maximum(x, lower), upper)) + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index 579b6696999..00fd08b1a07 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -21,6 +21,7 @@ tf_kernel_library( "cast_op.cc", "categorical_op.cc", "cholesky_op.cc", + "clip_by_value_op.cc", "concat_op.cc", "const_op.cc", "conv_ops.cc", diff --git a/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc b/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc new file mode 100644 index 00000000000..fdf75be7b11 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc @@ -0,0 +1,61 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/framework/tensor_shape.h" + +namespace tensorflow { +namespace { + +class ClipByValueOp : public XlaOpKernel { + public: + explicit ClipByValueOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + + void Compile(XlaOpKernelContext* ctx) override { + const TensorShape shape = ctx->InputShape(0); + const TensorShape min_shape = ctx->InputShape(1); + const TensorShape max_shape = ctx->InputShape(2); + + xla::ComputationBuilder* builder = ctx->builder(); + auto input = ctx->Input(0); + auto min = ctx->Input(1); + auto max = ctx->Input(2); + + auto shape_error = [&]() -> tensorflow::Status { + return errors::InvalidArgument( + "clip_value_min and clip_value_max must be either of " + "the same shape as input, or a scalar. ", + "Input shape: ", shape.DebugString(), + " clip_value_min shape: ", min_shape.DebugString(), + " clip_value_max shape: ", max_shape.DebugString()); + }; + + if (shape != min_shape) { + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(min_shape), shape_error()); + min = builder->Broadcast(min, shape.dim_sizes()); + } + if (shape != max_shape) { + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(max_shape), shape_error()); + max = builder->Broadcast(max, shape.dim_sizes()); + } + ctx->SetOutput(0, builder->Clamp(min, input, max)); + } +}; + +REGISTER_XLA_OP(Name("ClipByValue"), ClipByValueOp); + +} // namespace +} // namespace tensorflow From f0df6701d01954073e912f24f7c983de4f091a1e Mon Sep 17 00:00:00 2001 From: joel-shor Date: Fri, 20 Apr 2018 14:01:02 +0300 Subject: [PATCH 0508/1734] [tf.data] Check in a strictly faster rejection resampling transformation. 
This transformation is faster because it rejects fewer data. This is done by occasionally sampling from the original data distribution in an efficient way. Tested: bazel test :resample_test --- .../data/python/kernel_tests/resample_test.py | 128 +++++++-- .../contrib/data/python/ops/resampling.py | 271 ++++++++++++++---- 2 files changed, 329 insertions(+), 70 deletions(-) diff --git a/tensorflow/contrib/data/python/kernel_tests/resample_test.py b/tensorflow/contrib/data/python/kernel_tests/resample_test.py index 5f47dcb3399..9e1273eba13 100644 --- a/tensorflow/contrib/data/python/kernel_tests/resample_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/resample_test.py @@ -18,6 +18,8 @@ from __future__ import division from __future__ import print_function import numpy as np +import time +from absl.testing import parameterized from tensorflow.contrib.data.python.ops import resampling from tensorflow.python.data.ops import dataset_ops @@ -30,47 +32,70 @@ from tensorflow.python.platform import test from tensorflow.python.util import compat -class ResampleTest(test.TestCase): +def _time_resampling( + test_obj, data_np, target_dist, init_dist, use_v2, num_to_sample): + dataset = dataset_ops.Dataset.from_tensor_slices(data_np).repeat() - def testInitialKnownDistribution(self): - self._testDistribution(initial_known=True) + # Reshape distribution via rejection sampling. + apply_fn = (resampling.rejection_resample_v2 if use_v2 else + resampling.rejection_resample) + dataset = dataset.apply( + apply_fn( + class_func=lambda x: x, + target_dist=target_dist, + initial_dist=init_dist, + seed=142)) - def testInitialNotKnownDistribution(self): - self._testDistribution(initial_known=False) + get_next = dataset.make_one_shot_iterator().get_next() - def _testDistribution(self, initial_known): + with test_obj.test_session() as sess: + start_time = time.time() + for _ in xrange(num_to_sample): + sess.run(get_next) + end_time = time.time() + + return end_time - start_time + + +class ResampleTest(test.TestCase, parameterized.TestCase): + + @parameterized.named_parameters( + ('InitialnDistributionKnown', True, False), + ('InitialDistributionUnknown', False, False), + ('InitialDistributionKnownV2', True, True), + ('InitialDistributionUnknownV2', False, True)) + def testDistribution(self, initial_known, use_v2): classes = np.random.randint(5, size=(20000,)) # Uniformly sampled target_dist = [0.9, 0.05, 0.05, 0.0, 0.0] initial_dist = [0.2] * 5 if initial_known else None - iterator = (dataset_ops.Dataset.from_tensor_slices(classes).shuffle( - 200, seed=21).map(lambda c: (c, string_ops.as_string(c))).apply( - resampling.rejection_resample( - target_dist=target_dist, - initial_dist=initial_dist, - class_func=lambda c, _: c, - seed=27)).make_one_shot_iterator()) - get_next = iterator.get_next() + dataset = dataset_ops.Dataset.from_tensor_slices(classes).shuffle( + 200, seed=21).map(lambda c: (c, string_ops.as_string(c))).repeat() + apply_fn = (resampling.rejection_resample_v2 if use_v2 else + resampling.rejection_resample) + get_next = dataset.apply( + apply_fn( + target_dist=target_dist, + initial_dist=initial_dist, + class_func=lambda c, _: c, + seed=27)).make_one_shot_iterator().get_next() with self.test_session() as sess: returned = [] - with self.assertRaises(errors.OutOfRangeError): - while True: - returned.append(sess.run(get_next)) + while len(returned) < 4000: + returned.append(sess.run(get_next)) returned_classes, returned_classes_and_data = zip(*returned) _, returned_data = 
zip(*returned_classes_and_data) self.assertAllEqual([compat.as_bytes(str(c)) for c in returned_classes], returned_data) total_returned = len(returned_classes) - # Subsampling rejects a large percentage of the initial data in - # this case. - self.assertGreater(total_returned, 20000 * 0.2) class_counts = np.array([ len([True for v in returned_classes if v == c]) for c in range(5)]) returned_dist = class_counts / total_returned self.assertAllClose(target_dist, returned_dist, atol=1e-2) + def testRandomClasses(self): init_dist = [0.25, 0.25, 0.25, 0.25] target_dist = [0.0, 0.0, 0.0, 1.0] @@ -109,5 +134,68 @@ class ResampleTest(test.TestCase): self.assertAllClose(target_dist, bincount, atol=1e-2) + @parameterized.named_parameters( + ('InitialnDistributionKnown', True, False), + ('InitialDistributionUnknown', False, False), + ('InitialDistributionKnownV2', True, True), + ('InitialDistributionUnknownV2', False, True)) + def _testNewResampleIsFaster(self, target_dist, num_to_sample): + init_dist = [0.25, 0.25, 0.25, 0.25] + num_classes = len(init_dist) + num_samples = 1000 + data_np = np.random.choice(num_classes, num_samples, p=init_dist) + + fast_time = _time_resampling(self, data_np, target_dist, init_dist, + use_v2=True, num_to_sample=num_to_sample) + slow_time = _time_resampling(self, data_np, target_dist, init_dist, + use_v2=False, num_to_sample=num_to_sample) + + self.assertLess(fast_time, slow_time) + + + def testNewResampleIsFasterSmallSkewManySamples(self): + self._testNewResampleIsFaster([0.1, 0.1, 0.1, 0.7], 1000) + + def testNewResampleIsFasterBigSkewManySamples(self): + self._testNewResampleIsFaster([0.01, 0.01, 0.01, 0.97], 1000) + + def testNewResampleIsFasterSmallSkewFewSamples(self): + self._testNewResampleIsFaster([0.1, 0.1, 0.1, 0.7], 100) + + def testNewResampleIsFasterBigSkewFewSamples(self): + self._testNewResampleIsFaster([0.01, 0.01, 0.01, 0.97], 100) + + +class MapDatasetBenchmark(test.Benchmark): + + def benchmarkResamplePerformance(self): + init_dist = [0.25, 0.25, 0.25, 0.25] + target_dist = [0.0, 0.0, 0.0, 1.0] + num_classes = len(init_dist) + # We don't need many samples to test a dirac-delta target distribution + num_samples = 1000 + data_np = np.random.choice(num_classes, num_samples, p=init_dist) + + resample_time = _time_resampling( + self, data_np, target_dist, init_dist, use_v2=False, num_to_sample=1000) + + self.report_benchmark( + iters=1000, wall_time=resample_time, name="benchmark_resample") + + def benchmarkResampleAndBatchPerformance(self): + init_dist = [0.25, 0.25, 0.25, 0.25] + target_dist = [0.0, 0.0, 0.0, 1.0] + num_classes = len(init_dist) + # We don't need many samples to test a dirac-delta target distribution + num_samples = 1000 + data_np = np.random.choice(num_classes, num_samples, p=init_dist) + + resample_time = _time_resampling( + self, data_np, target_dist, init_dist, use_v2=True, num_to_sample=1000) + + self.report_benchmark( + iters=1000, wall_time=resample_time, name="benchmark_resample_v2") + + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/data/python/ops/resampling.py b/tensorflow/contrib/data/python/ops/resampling.py index b465397437a..94e28b9a2da 100644 --- a/tensorflow/contrib/data/python/ops/resampling.py +++ b/tensorflow/contrib/data/python/ops/resampling.py @@ -20,6 +20,7 @@ from __future__ import print_function import numpy as np from tensorflow.contrib.data.python.ops import batching +from tensorflow.contrib.data.python.ops import interleave_ops from tensorflow.contrib.data.python.ops import scan_ops 
from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import dtypes @@ -50,14 +51,15 @@ def rejection_resample(class_func, target_dist, initial_dist=None, seed=None): A `Dataset` transformation function, which can be passed to @{tf.data.Dataset.apply}. """ - def _apply_fn(dataset): """Function from `Dataset` to `Dataset` that applies the transformation.""" - dist_estimation_batch_size = 32 target_dist_t = ops.convert_to_tensor(target_dist, name="target_dist") class_values_ds = dataset.map(class_func) + + # Get initial distribution. if initial_dist is not None: - initial_dist_t = ops.convert_to_tensor(initial_dist, name="initial_dist") + initial_dist_t = ops.convert_to_tensor( + initial_dist, name="initial_dist") acceptance_dist = _calculate_acceptance_probs(initial_dist_t, target_dist_t) initial_dist_ds = dataset_ops.Dataset.from_tensors( @@ -65,55 +67,181 @@ def rejection_resample(class_func, target_dist, initial_dist=None, seed=None): acceptance_dist_ds = dataset_ops.Dataset.from_tensors( acceptance_dist).repeat() else: - num_classes = (target_dist_t.shape[0].value or - array_ops.shape(target_dist_t)[0]) - smoothing_constant = 10 - initial_examples_per_class_seen = array_ops.fill( - [num_classes], np.int64(smoothing_constant)) - - def update_estimate_and_tile(num_examples_per_class_seen, c): - updated_examples_per_class_seen, dist = _estimate_data_distribution( - c, num_examples_per_class_seen) - tiled_dist = array_ops.tile( - array_ops.expand_dims(dist, 0), [dist_estimation_batch_size, 1]) - return updated_examples_per_class_seen, tiled_dist - - initial_dist_ds = (class_values_ds.batch(dist_estimation_batch_size) - .apply(scan_ops.scan(initial_examples_per_class_seen, - update_estimate_and_tile)) - .apply(batching.unbatch())) + initial_dist_ds = _estimate_initial_dist_ds( + target_dist_t, class_values_ds) acceptance_dist_ds = initial_dist_ds.map( lambda initial: _calculate_acceptance_probs(initial, target_dist_t)) - - def maybe_warn_on_large_rejection(accept_dist, initial_dist): - proportion_rejected = math_ops.reduce_sum( - (1 - accept_dist) * initial_dist) - return control_flow_ops.cond( - math_ops.less(proportion_rejected, .5), - lambda: accept_dist, - lambda: logging_ops.Print( # pylint: disable=g-long-lambda - accept_dist, [proportion_rejected, initial_dist, accept_dist], - message="Proportion of examples rejected by sampler is high: ", - summarize=100, - first_n=10)) - - acceptance_dist_ds = (dataset_ops.Dataset.zip((acceptance_dist_ds, - initial_dist_ds)) - .map(maybe_warn_on_large_rejection)) - - def _gather_and_copy(class_val, acceptance_prob, data): - return (class_val, array_ops.gather(acceptance_prob, class_val), data) - current_probabilities_and_class_and_data_ds = dataset_ops.Dataset.zip( - (class_values_ds, acceptance_dist_ds, dataset)).map(_gather_and_copy) - filtered_ds = ( - current_probabilities_and_class_and_data_ds - .filter(lambda _1, p, _2: random_ops.random_uniform([], seed=seed) < p)) - return filtered_ds.map(lambda class_value, _, data: (class_value, data)) - + return _filter_ds(dataset, acceptance_dist_ds, initial_dist_ds, + class_values_ds, seed) return _apply_fn +def rejection_resample_v2(class_func, target_dist, initial_dist=None, + seed=None): + """A transformation that resamples a dataset to achieve a target distribution. + + This differs from v1 in that it will also sample from the original dataset + with some probability, so it makes strictly fewer data rejections. This + transformation is faster than the original. 
+ + **NOTE** Resampling is performed via rejection sampling; some fraction + of the input values will be dropped. + + Args: + class_func: A function mapping an element of the input dataset to a scalar + `tf.int32` tensor. Values should be in `[0, num_classes)`. + target_dist: A floating point type tensor, shaped `[num_classes]`. + initial_dist: (Optional.) A floating point type tensor, shaped + `[num_classes]`. If not provided, the true class distribution is + estimated live in a streaming fashion. + seed: (Optional.) Python integer seed for the resampler. + + Returns: + A `Dataset` transformation function, which can be passed to + @{tf.data.Dataset.apply}. + """ + def _apply_fn(dataset): + """Function from `Dataset` to `Dataset` that applies the transformation.""" + target_dist_t = ops.convert_to_tensor(target_dist, name="target_dist") + class_values_ds = dataset.map(class_func) + + # Get initial distribution. + if initial_dist is not None: + initial_dist_t = ops.convert_to_tensor( + initial_dist, name="initial_dist") + acceptance_dist, prob_of_original = ( + _calculate_acceptance_probs_with_mixing(initial_dist_t, + target_dist_t)) + initial_dist_ds = dataset_ops.Dataset.from_tensors( + initial_dist_t).repeat() + acceptance_dist_ds = dataset_ops.Dataset.from_tensors( + acceptance_dist).repeat() + prob_of_original_ds = dataset_ops.Dataset.from_tensors( + prob_of_original).repeat() + else: + initial_dist_ds = _estimate_initial_dist_ds( + target_dist_t, class_values_ds) + acceptance_and_original_prob_ds = initial_dist_ds.map( + lambda initial: _calculate_acceptance_probs_with_mixing( + initial, target_dist_t)) + acceptance_dist_ds = acceptance_and_original_prob_ds.map( + lambda accept_prob, _: accept_prob) + prob_of_original_ds = acceptance_and_original_prob_ds.map( + lambda _, prob_original: prob_original) + filtered_ds = _filter_ds(dataset, acceptance_dist_ds, initial_dist_ds, + class_values_ds, seed) + # Prefetch filtered dataset for speed. + filtered_ds = filtered_ds.prefetch(3) + + return interleave_ops.sample_from_datasets( + [dataset_ops.Dataset.zip((class_values_ds, dataset)), filtered_ds], + weights=prob_of_original_ds.map(lambda prob: [(prob, 1.0 - prob)]), + seed=seed) + + return _apply_fn + + +def _random_interleave_datasets(ds1, ds1_classes, ds2, prob_of_ds1, seed=None): + """Randomly interleave datasets. + + We carefully combine `ds1` and 'ds2' so that we don't needlessly compute the + filtering. + + Args: + ds1: A dataset to interleave. + ds1_classes: Dataset of class values associated with ds1. + ds2: Another dataset to interleave. + prob_of_ds1: A dataset of probabilities. Each probability represents the + likelihood of drawing from `ds1`. + seed: (Optional.) Python integer seed for the resampler. + + Returns: + A single dataset, combined from `ds1` and `ds2`. + """ + num_filtered_to_prefetch = 3 + ds2 = ds2.prefetch(num_filtered_to_prefetch) + filtered_iterator = ds2.make_one_shot_iterator() + combined_ds = dataset_ops.Dataset.zip( + (ds1_classes, ds1, prob_of_ds1)).map( + lambda ds1_class, original_data, prob_of_original: + control_flow_ops.cond( + random_ops.random_uniform([], seed=seed) < prob_of_original, + lambda: (ds1_class, original_data), + filtered_iterator.get_next)) + return combined_ds + + +def _filter_ds(dataset, acceptance_dist_ds, initial_dist_ds, class_values_ds, + seed): + """Filters a dataset based on per-class acceptance probabilities. + + Args: + dataset: The dataset to be filtered. + acceptance_dist_ds: A dataset of acceptance probabilities. 
+ initial_dist_ds: A dataset of the initial probability distribution, given or + estimated. + class_values_ds: A dataset of the corresponding classes. + seed: (Optional.) Python integer seed for the resampler. + + Returns: + A dataset of (class value, data) after filtering. + """ + def maybe_warn_on_large_rejection(accept_dist, initial_dist): + proportion_rejected = math_ops.reduce_sum((1 - accept_dist) * initial_dist) + return control_flow_ops.cond( + math_ops.less(proportion_rejected, .5), + lambda: accept_dist, + lambda: logging_ops.Print( # pylint: disable=g-long-lambda + accept_dist, [proportion_rejected, initial_dist, accept_dist], + message="Proportion of examples rejected by sampler is high: ", + summarize=100, + first_n=10)) + + acceptance_dist_ds = (dataset_ops.Dataset.zip((acceptance_dist_ds, + initial_dist_ds)) + .map(maybe_warn_on_large_rejection)) + + def _gather_and_copy(class_val, acceptance_prob, data): + return class_val, array_ops.gather(acceptance_prob, class_val), data + + current_probabilities_and_class_and_data_ds = dataset_ops.Dataset.zip( + (class_values_ds, acceptance_dist_ds, dataset)).map(_gather_and_copy) + filtered_ds = ( + current_probabilities_and_class_and_data_ds + .filter(lambda _1, p, _2: random_ops.random_uniform([], seed=seed) < p)) + return filtered_ds.map(lambda class_value, _, data: (class_value, data)) + + +def _estimate_initial_dist_ds( + target_dist_t, class_values_ds, dist_estimation_batch_size=32, + smoothing_constant=10): + num_classes = (target_dist_t.shape[0].value or + array_ops.shape(target_dist_t)[0]) + initial_examples_per_class_seen = array_ops.fill( + [num_classes], np.int64(smoothing_constant)) + + def update_estimate_and_tile(num_examples_per_class_seen, c): + updated_examples_per_class_seen, dist = _estimate_data_distribution( + c, num_examples_per_class_seen) + tiled_dist = array_ops.tile( + array_ops.expand_dims(dist, 0), [dist_estimation_batch_size, 1]) + return updated_examples_per_class_seen, tiled_dist + + initial_dist_ds = (class_values_ds.batch(dist_estimation_batch_size) + .apply(scan_ops.scan(initial_examples_per_class_seen, + update_estimate_and_tile)) + .apply(batching.unbatch())) + + return initial_dist_ds + + +def _get_target_to_initial_ratio(initial_probs, target_probs): + # Add tiny to initial_probs to avoid divide by zero. + denom = (initial_probs + np.finfo(initial_probs.dtype.as_numpy_dtype).tiny) + return target_probs / denom + + def _calculate_acceptance_probs(initial_probs, target_probs): """Calculate the per-class acceptance rates. @@ -152,13 +280,10 @@ def _calculate_acceptance_probs(initial_probs, target_probs): 0 <= t_i <= 1, sum_i(t_i) = 1 ``` - A solution for a_i in terms of the other variables is the following: ```a_i = (t_i / p_i) / max_i[t_i / p_i]``` """ - # Add tiny to initial_probs to avoid divide by zero. - denom = (initial_probs + np.finfo(initial_probs.dtype.as_numpy_dtype).tiny) - ratio_l = target_probs / denom + ratio_l = _get_target_to_initial_ratio(initial_probs, target_probs) # Calculate list of acceptance probabilities. max_ratio = math_ops.reduce_max(ratio_l) @@ -188,3 +313,49 @@ def _estimate_data_distribution(c, num_examples_per_class_seen): math_ops.reduce_sum(num_examples_per_class_seen)) dist = math_ops.cast(init_prob_estimate, dtypes.float32) return num_examples_per_class_seen, dist + + +def _calculate_acceptance_probs_with_mixing(initial_probs, target_probs): + """Calculates the acceptance probabilities and mixing ratio. 
+ + In this case, we assume that we can *either* sample from the original data + distribution with probability `m`, or sample from a reshaped distribution + that comes from rejection sampling on the original distribution. This + rejection sampling is done on a per-class basis, with `a_i` representing the + probability of accepting data from class `i`. + + If we try to minimize the amount of data rejected, we get the following: + + M_max = max_i [ t_i / p_i ] + M_min = min_i [ t_i / p_i ] + + The desired probability of accepting data if it comes from class `i`: + + a_i = (t_i/p_i - m) / (M_max - m) + + The desired probability of pulling a data element from the original dataset, + rather than the filtered one: + + m = M_min + + See the docstring for `_calculate_acceptance_probs` for more details. + + Args: + initial_probs: A Tensor of the initial probability distribution, given or + estimated. + target_probs: A Tensor of the corresponding classes. + + Returns: + (A 1D Tensor with the per-class acceptance probabilities, the desired + probability of pull from the original distribution.) + """ + ratio_l = _get_target_to_initial_ratio(initial_probs, target_probs) + max_ratio = math_ops.reduce_max(ratio_l) + min_ratio = math_ops.reduce_min(ratio_l) + + # Target prob to sample from original distribution. + m = min_ratio + + # TODO(joelshor): Simplify fraction, if possible. + a_i = (ratio_l - m) / (max_ratio - m) + return a_i, m From b1067116c6a2351f4c597a9391b21ad0f513565b Mon Sep 17 00:00:00 2001 From: joel-shor Date: Fri, 20 Apr 2018 14:27:30 +0300 Subject: [PATCH 0509/1734] [tf.data] Clean up resampler and update BUILD files. --- .../contrib/data/python/kernel_tests/BUILD | 6 ++- .../data/python/kernel_tests/resample_test.py | 32 +++++---------- tensorflow/contrib/data/python/ops/BUILD | 2 + .../contrib/data/python/ops/resampling.py | 40 ++++--------------- 4 files changed, 23 insertions(+), 57 deletions(-) diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD index b15b9663f4c..a6b46b37e77 100644 --- a/tensorflow/contrib/data/python/kernel_tests/BUILD +++ b/tensorflow/contrib/data/python/kernel_tests/BUILD @@ -308,13 +308,17 @@ py_test( srcs_version = "PY2AND3", tags = ["noasan"], deps = [ + "//third_party/py/absl/testing:parameterized", + "//third_party/py/numpy", "//tensorflow/contrib/data/python/ops:resampling", "//tensorflow/python:client_testlib", + "//tensorflow/python:dtypes", "//tensorflow/python:errors", + "//tensorflow/python:math_ops", + "//tensorflow/python:random_ops", "//tensorflow/python:string_ops", "//tensorflow/python:util", "//tensorflow/python/data/ops:dataset_ops", - "//third_party/py/numpy", ], ) diff --git a/tensorflow/contrib/data/python/kernel_tests/resample_test.py b/tensorflow/contrib/data/python/kernel_tests/resample_test.py index 9e1273eba13..97c4b68cb64 100644 --- a/tensorflow/contrib/data/python/kernel_tests/resample_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/resample_test.py @@ -60,10 +60,10 @@ def _time_resampling( class ResampleTest(test.TestCase, parameterized.TestCase): @parameterized.named_parameters( - ('InitialnDistributionKnown', True, False), - ('InitialDistributionUnknown', False, False), - ('InitialDistributionKnownV2', True, True), - ('InitialDistributionUnknownV2', False, True)) + ("InitialnDistributionKnown", True, False), + ("InitialDistributionUnknown", False, False), + ("InitialDistributionKnownV2", True, True), + ("InitialDistributionUnknownV2", False, True)) def 
   def testDistribution(self, initial_known, use_v2):
     classes = np.random.randint(5, size=(20000,))  # Uniformly sampled
     target_dist = [0.9, 0.05, 0.05, 0.0, 0.0]
@@ -95,7 +95,6 @@ class ResampleTest(test.TestCase, parameterized.TestCase):
     returned_dist = class_counts / total_returned
     self.assertAllClose(target_dist, returned_dist, atol=1e-2)

-
   def testRandomClasses(self):
     init_dist = [0.25, 0.25, 0.25, 0.25]
     target_dist = [0.0, 0.0, 0.0, 1.0]
@@ -135,11 +134,11 @@ class ResampleTest(test.TestCase, parameterized.TestCase):
     self.assertAllClose(target_dist, bincount, atol=1e-2)

   @parameterized.named_parameters(
-      ('InitialnDistributionKnown', True, False),
-      ('InitialDistributionUnknown', False, False),
-      ('InitialDistributionKnownV2', True, True),
-      ('InitialDistributionUnknownV2', False, True))
-  def _testNewResampleIsFaster(self, target_dist, num_to_sample):
+      ("SmallSkewManySamples", [0.1, 0.1, 0.1, 0.7], 1000),
+      ("BigSkewManySamples", [0.01, 0.01, 0.01, 0.97], 1000),
+      ("SmallSkewFewSamples", [0.1, 0.1, 0.1, 0.7], 100),
+      ("BigSkewFewSamples", [0.01, 0.01, 0.01, 0.97], 100))
+  def testNewResampleIsFaster(self, target_dist, num_to_sample):
     init_dist = [0.25, 0.25, 0.25, 0.25]
     num_classes = len(init_dist)
     num_samples = 1000
@@ -153,19 +152,6 @@ class ResampleTest(test.TestCase, parameterized.TestCase):

     self.assertLess(fast_time, slow_time)

-  def testNewResampleIsFasterSmallSkewManySamples(self):
-    self._testNewResampleIsFaster([0.1, 0.1, 0.1, 0.7], 1000)
-
-  def testNewResampleIsFasterBigSkewManySamples(self):
-    self._testNewResampleIsFaster([0.01, 0.01, 0.01, 0.97], 1000)
-
-  def testNewResampleIsFasterSmallSkewFewSamples(self):
-    self._testNewResampleIsFaster([0.1, 0.1, 0.1, 0.7], 100)
-
-  def testNewResampleIsFasterBigSkewFewSamples(self):
-    self._testNewResampleIsFaster([0.01, 0.01, 0.01, 0.97], 100)
-
-
 class MapDatasetBenchmark(test.Benchmark):

   def benchmarkResamplePerformance(self):
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index e00f2304cc4..8cb4fa7f149 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -193,7 +193,9 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":batching",
+        ":interleave_ops",
         ":scan_ops",
+        "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
diff --git a/tensorflow/contrib/data/python/ops/resampling.py b/tensorflow/contrib/data/python/ops/resampling.py
index 94e28b9a2da..16d851bf964 100644
--- a/tensorflow/contrib/data/python/ops/resampling.py
+++ b/tensorflow/contrib/data/python/ops/resampling.py
@@ -82,8 +82,12 @@ def rejection_resample_v2(class_func, target_dist, initial_dist=None,
   """A transformation that resamples a dataset to achieve a target distribution.

   This differs from v1 in that it will also sample from the original dataset
-  with some probability, so it makes strictly fewer data rejections. This
-  transformation is faster than the original.
+  with some probability, so it makes strictly fewer data rejections. Due to an
+  implementation detail, it creates a separate iterator that must be
+  initialized, so the dataset becomes stateful after this transformation is
+  applied (`make_one_shot_iterator` won't work; users must use
+  `make_initializable_iterator`). Aside from that initialization overhead,
+  this transformation is faster than the original.

 **NOTE** Resampling is performed via rejection sampling; some fraction of the
 input values will be dropped.
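A quick numerical illustration of the mixing computation defined in
`_calculate_acceptance_probs_with_mixing` above. This is a minimal NumPy
sketch, not the library code: it assumes plain 1-D arrays, strictly positive
`initial_probs`, and a non-degenerate ratio spread, and it skips the `tiny`
guard that `_get_target_to_initial_ratio` adds.

```
import numpy as np

def acceptance_probs_with_mixing(initial_probs, target_probs):
  # Per-class ratio t_i / p_i, as in _get_target_to_initial_ratio.
  ratio = target_probs / initial_probs
  m = ratio.min()  # M_min: probability of pulling from the unfiltered stream
  a = (ratio - m) / (ratio.max() - m)  # per-class acceptance probabilities
  return a, m

p = np.array([0.25, 0.25, 0.25, 0.25])  # initial distribution
t = np.array([0.0, 0.0, 0.0, 1.0])      # target distribution
a, m = acceptance_probs_with_mixing(p, t)
# a == [0., 0., 0., 1.]: only class 3 survives the filter.
# m == 0.: with this much skew, never sample from the unfiltered stream.
```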
@@ -142,36 +146,6 @@ def rejection_resample_v2(class_func, target_dist, initial_dist=None, return _apply_fn -def _random_interleave_datasets(ds1, ds1_classes, ds2, prob_of_ds1, seed=None): - """Randomly interleave datasets. - - We carefully combine `ds1` and 'ds2' so that we don't needlessly compute the - filtering. - - Args: - ds1: A dataset to interleave. - ds1_classes: Dataset of class values associated with ds1. - ds2: Another dataset to interleave. - prob_of_ds1: A dataset of probabilities. Each probability represents the - likelihood of drawing from `ds1`. - seed: (Optional.) Python integer seed for the resampler. - - Returns: - A single dataset, combined from `ds1` and `ds2`. - """ - num_filtered_to_prefetch = 3 - ds2 = ds2.prefetch(num_filtered_to_prefetch) - filtered_iterator = ds2.make_one_shot_iterator() - combined_ds = dataset_ops.Dataset.zip( - (ds1_classes, ds1, prob_of_ds1)).map( - lambda ds1_class, original_data, prob_of_original: - control_flow_ops.cond( - random_ops.random_uniform([], seed=seed) < prob_of_original, - lambda: (ds1_class, original_data), - filtered_iterator.get_next)) - return combined_ds - - def _filter_ds(dataset, acceptance_dist_ds, initial_dist_ds, class_values_ds, seed): """Filters a dataset based on per-class acceptance probabilities. @@ -358,4 +332,4 @@ def _calculate_acceptance_probs_with_mixing(initial_probs, target_probs): # TODO(joelshor): Simplify fraction, if possible. a_i = (ratio_l - m) / (max_ratio - m) - return a_i, m + return a_i, m \ No newline at end of file From 0cba8b7c66bead25ed2e6e1c6bf5a23d6cbe9557 Mon Sep 17 00:00:00 2001 From: joel-shor Date: Fri, 20 Apr 2018 14:44:47 +0300 Subject: [PATCH 0510/1734] [tf.data] Fix `absl` build rule. --- tensorflow/contrib/data/python/kernel_tests/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD index a6b46b37e77..f90b17e79ee 100644 --- a/tensorflow/contrib/data/python/kernel_tests/BUILD +++ b/tensorflow/contrib/data/python/kernel_tests/BUILD @@ -308,7 +308,6 @@ py_test( srcs_version = "PY2AND3", tags = ["noasan"], deps = [ - "//third_party/py/absl/testing:parameterized", "//third_party/py/numpy", "//tensorflow/contrib/data/python/ops:resampling", "//tensorflow/python:client_testlib", @@ -319,6 +318,7 @@ py_test( "//tensorflow/python:string_ops", "//tensorflow/python:util", "//tensorflow/python/data/ops:dataset_ops", + "@absl_py//absl/testing:parameterized", ], ) From 8cc506f8f6c3e9071069ede1cd5c91a9f3da7c11 Mon Sep 17 00:00:00 2001 From: joel-shor Date: Fri, 20 Apr 2018 15:00:02 +0300 Subject: [PATCH 0511/1734] [tf.data] Reorder BUILD rule deps and add `xrange` from `six`. 
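The `absl` dependency being wired up here backs the `named_parameters`
decorator used throughout `resample_test.py`. For reference, a minimal,
self-contained sketch of the pattern; the test class and values are
hypothetical, not part of the patch:

```
import unittest
from absl.testing import parameterized


class SquareTest(parameterized.TestCase):

  @parameterized.named_parameters(
      ("Zero", 0, 0),
      ("Two", 2, 4),
      ("Negative", -3, 9))
  def testSquare(self, arg, expected):
    # Each tuple becomes its own test case: testSquareZero, testSquareTwo, ...
    self.assertEqual(arg * arg, expected)


if __name__ == "__main__":
  unittest.main()
```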
---
 tensorflow/contrib/data/python/kernel_tests/BUILD            | 2 +-
 tensorflow/contrib/data/python/kernel_tests/resample_test.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index f90b17e79ee..92c69679338 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -308,7 +308,6 @@ py_test(
     srcs_version = "PY2AND3",
     tags = ["noasan"],
     deps = [
-        "//third_party/py/numpy",
         "//tensorflow/contrib/data/python/ops:resampling",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
@@ -318,6 +317,7 @@ py_test(
         "//tensorflow/python:string_ops",
         "//tensorflow/python:util",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
 )
diff --git a/tensorflow/contrib/data/python/kernel_tests/resample_test.py b/tensorflow/contrib/data/python/kernel_tests/resample_test.py
index 97c4b68cb64..7f007fede8c 100644
--- a/tensorflow/contrib/data/python/kernel_tests/resample_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/resample_test.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function

 import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
 import time

 from absl.testing import parameterized

From a10708db0d587831cafcb2e7dbdcbbcf11aede95 Mon Sep 17 00:00:00 2001
From: joel-shor
Date: Fri, 20 Apr 2018 15:09:50 +0300
Subject: [PATCH 0512/1734] [tf.data] Second reorder BUILD rule deps.

---
 tensorflow/contrib/data/python/ops/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index 8cb4fa7f149..d9a55025080 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -195,7 +195,6 @@ py_library(
         ":batching",
         ":interleave_ops",
         ":scan_ops",
-        "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
@@ -204,6 +203,7 @@ py_library(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
     ],
 )

From 0c03255aa5f4b37de97e0685ffa15888fc16e4b3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Fri, 20 Apr 2018 06:36:56 -0700
Subject: [PATCH 0513/1734] internal change

PiperOrigin-RevId: 193659701
---
 .../lite/toco/graph_transformations/propagate_fixed_sizes.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index b34aca1f091..ba244cf5ef5 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -1516,10 +1516,7 @@ void ProcessArgMaxOperator(Model* model, ArgMaxOperator* op) {
     return;
   }

-  // The current ArgMax implementation only supports 4-dimensional inputs with
-  // the last dimension as the axis to perform ArgMax for.
   const std::vector<int>& input_dims = input_array.shape().dims();
-  CHECK_EQ(input_dims.size(), 4);

   std::vector<int> output_dims;
   output_dims.reserve(input_dims.size() - 1);

From c212d5542bb666b613a8567338983288a3ab15f4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Unique TensorFlower" Date: Fri, 20 Apr 2018 08:08:01 -0700 Subject: [PATCH 0514/1734] Eliminate the guard around Winograd non-fused convolutions with cudnn7. PiperOrigin-RevId: 193669636 --- .../fused_conv2d_bias_activation_op.cc | 3 +- .../core/kernels/conv_grad_filter_ops.cc | 3 +- .../core/kernels/conv_grad_input_ops.cc | 3 +- tensorflow/core/kernels/conv_grad_ops_3d.cc | 8 +++-- tensorflow/core/kernels/conv_ops.cc | 3 +- tensorflow/core/kernels/conv_ops_3d.cc | 4 ++- tensorflow/core/kernels/conv_ops_gpu.h | 35 +++++++++++++------ tensorflow/core/kernels/conv_ops_test.cc | 26 +++++++++----- 8 files changed, 59 insertions(+), 26 deletions(-) diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc index 0e06575d96f..1e8f011b5d8 100644 --- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc +++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc @@ -543,7 +543,8 @@ void LaunchFusedConv2DBiasActivationOp:: fused_conv_parameters, &algorithm_config)) { std::vector algorithms; CHECK(stream->parent()->GetConvolveAlgorithms( - fused_conv_parameters.ShouldIncludeWinogradNonfusedAlgo(), + fused_conv_parameters.ShouldIncludeWinogradNonfusedAlgo( + stream->parent()), &algorithms)); dnn::ProfileResult best_result; dnn::ProfileResult best_result_no_scratch; diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc index 66ee474ca3f..f3b91494b97 100644 --- a/tensorflow/core/kernels/conv_grad_filter_ops.cc +++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc @@ -912,7 +912,8 @@ void LaunchConv2DBackpropFilterOp::operator()( conv_parameters, &algorithm_config)) { std::vector algorithms; CHECK(stream->parent()->GetConvolveBackwardFilterAlgorithms( - conv_parameters.ShouldIncludeWinogradNonfusedAlgo(), &algorithms)); + conv_parameters.ShouldIncludeWinogradNonfusedAlgo(stream->parent()), + &algorithms)); ProfileResult best_result; ProfileResult best_result_no_scratch; for (auto profile_algorithm : algorithms) { diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc index 71ea0d5d720..66d15c6e787 100644 --- a/tensorflow/core/kernels/conv_grad_input_ops.cc +++ b/tensorflow/core/kernels/conv_grad_input_ops.cc @@ -961,7 +961,8 @@ void LaunchConv2DBackpropInputOp::operator()( conv_parameters, &algorithm_config)) { std::vector algorithms; CHECK(stream->parent()->GetConvolveBackwardDataAlgorithms( - conv_parameters.ShouldIncludeWinogradNonfusedAlgo(), &algorithms)); + conv_parameters.ShouldIncludeWinogradNonfusedAlgo(stream->parent()), + &algorithms)); ProfileResult best_result; ProfileResult best_result_no_scratch; for (auto profile_algorithm : algorithms) { diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc index 3650ab53b25..1234997bc57 100644 --- a/tensorflow/core/kernels/conv_grad_ops_3d.cc +++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc @@ -662,7 +662,9 @@ class Conv3DBackpropInputOp : public OpKernel { conv_parameters, &algorithm_config)) { std::vector algorithms; CHECK(stream->parent()->GetConvolveBackwardDataAlgorithms( - conv_parameters.ShouldIncludeWinogradNonfusedAlgo(), &algorithms)); + conv_parameters.ShouldIncludeWinogradNonfusedAlgo( + stream->parent()), + &algorithms)); ProfileResult best_result; ProfileResult best_result_no_scratch; for (auto profile_algorithm : algorithms) { @@ -1029,7 
@@ -1029,7 +1031,9 @@ class Conv3DBackpropFilterOp : public OpKernel {
             conv_parameters, &algorithm_config)) {
       std::vector<AlgorithmDesc> algorithms;
       CHECK(stream->parent()->GetConvolveBackwardFilterAlgorithms(
-          conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+          conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(
+              stream->parent()),
+          &algorithms));
       ProfileResult best_result;
       ProfileResult best_result_no_scratch;
       for (auto profile_algorithm : algorithms) {
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 88843e4da78..f0888c655fe 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -710,7 +710,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
       !AutoTuneConv::GetInstance()->Find(conv_parameters, &algorithm_config)) {
     std::vector<AlgorithmDesc> algorithms;
     CHECK(stream->parent()->GetConvolveAlgorithms(
-        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(stream->parent()),
+        &algorithms));
     ProfileResult best_result;
     ProfileResult best_result_no_scratch;
     for (auto profile_algorithm : algorithms) {
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 21c84b2a0ed..0b7c1524e65 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -396,7 +396,9 @@ struct LaunchConvOp<GPUDevice, T> {
             conv_parameters, &algorithm_config)) {
       std::vector<AlgorithmDesc> algorithms;
       CHECK(stream->parent()->GetConvolveAlgorithms(
-          conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+          conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(
+              stream->parent()),
+          &algorithms));
       ProfileResult best_result;
       ProfileResult best_result_no_scratch;
       for (auto profile_algorithm : algorithms) {
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index f0085be3a53..7f9cfec981f 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -137,20 +137,18 @@ class ConvParameters {
     // clang-format on
   }

-  // TODO(yangzihao): The purpose of this function is to disable winograd
-  // nonfused conv algorithm for certain input parameters so as to avoid a bug
-  // in cuDNNv5 and cuDNNv6. Remove this once switch to cuDNNv7.
+  // The purpose of this function is to disable winograd nonfused conv algorithm
+  // for certain input parameters so as to avoid a bug in cuDNNv5 and cuDNNv6.
   template <typename T>
-  bool ShouldIncludeWinogradNonfusedAlgo() const {
-    int64 total_size = 16 * std::ceil(batch_ / 16.0) *
-                       std::max(in_depths_, out_depths_) * in_[0] * in_[1] *
-                       sizeof(T);
-    int64 threshold = 1LL << 31;
-    if (total_size >= threshold) {
-      return false;
-    } else {
+  bool ShouldIncludeWinogradNonfusedAlgo(
+      perftools::gputools::StreamExecutor* stream_exec) const {
+    // Skip this check for cuDNN 7 and newer.
+    perftools::gputools::port::StatusOr<std::tuple<int, int, int>> version =
+        stream_exec->AsDnn()->GetVersion();
+    if (version.ok() && std::get<0>(version.ValueOrDie()) >= 7) {
       return true;
     }
+    return ShouldIncludeWinogradNonfusedAlgoPreCudnn7<T>();
   }

 protected:
@@ -166,6 +164,21 @@ class ConvParameters {
   uint64 hash_code_;

  private:
+  friend struct ConvParametersPeer;  // For testing purposes.
+
+  template <typename T>
+  bool ShouldIncludeWinogradNonfusedAlgoPreCudnn7() const {
+    int64 total_size = 16 * std::ceil(batch_ / 16.0) *
+                       std::max(in_depths_, out_depths_) * in_[0] * in_[1] *
+                       sizeof(T);
+    int64 threshold = 1LL << 31;
+    if (total_size >= threshold) {
+      return false;
+    } else {
+      return true;
+    }
+  }
+
   int64 batch_;
   int64 in_depths_;
   int64 out_depths_;
diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc
index e2e166c02fe..8afe6a2cbdf 100644
--- a/tensorflow/core/kernels/conv_ops_test.cc
+++ b/tensorflow/core/kernels/conv_ops_test.cc
@@ -22,20 +22,28 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/conv_ops_gpu.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/public/session.h"

-#include "tensorflow/core/kernels/conv_ops_gpu.h"
-
 namespace tensorflow {

 #if GOOGLE_CUDA

+struct ConvParametersPeer {
+  template <typename T>
+  bool ShouldIncludeWinogradNonfusedAlgoPreCudnn7() {
+    return params.ShouldIncludeWinogradNonfusedAlgoPreCudnn7<T>();
+  }
+
+  ConvParameters params;
+};
+
 TEST(ConvParameters, WinogradNonfusedAlgoSize) {
-  ConvParameters conv_params_small = {
+  ConvParametersPeer conv_params_small = {{
       1,            // batch
       32,           // in_depths
       {{300,        // in_rows
@@ -51,10 +59,11 @@ TEST(ConvParameters, WinogradNonfusedAlgoSize) {
        0}},         // padding_cols
      DT_FLOAT,      // tensor datatype
      0,             // device_id
-  };
-  EXPECT_TRUE(conv_params_small.ShouldIncludeWinogradNonfusedAlgo<float>());
+  }};
+  EXPECT_TRUE(
+      conv_params_small.ShouldIncludeWinogradNonfusedAlgoPreCudnn7<float>());

-  ConvParameters conv_params_large = {
+  ConvParametersPeer conv_params_large = {{
       1,            // batch
       128,          // in_depths
       {{300,        // in_rows
@@ -70,8 +79,9 @@ TEST(ConvParameters, WinogradNonfusedAlgoSize) {
        0}},         // padding_cols
      DT_FLOAT,      // tensor datatype
      0,             // device_id
-  };
-  EXPECT_FALSE(conv_params_large.ShouldIncludeWinogradNonfusedAlgo<float>());
+  }};
+  EXPECT_FALSE(
+      conv_params_large.ShouldIncludeWinogradNonfusedAlgoPreCudnn7<float>());
 }

 #endif  // GOOGLE_CUDA

From 3e20fee5810796f70713122d235176b9c022ef41 Mon Sep 17 00:00:00 2001
From: Junpeng Lao
Date: Fri, 20 Apr 2018 18:05:52 +0200
Subject: [PATCH 0515/1734] Address comments from @srvasude

---
 .../kernel_tests/bijectors/ordered_test.py | 32 +++++++++++--------
 .../python/ops/bijectors/ordered.py        | 21 ++++++++----
 2 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
index 63c8f1fb316..721dba9c3ad 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
@@ -1,4 +1,4 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -23,33 +23,36 @@ import numpy as np
 from tensorflow.contrib.distributions.python.ops.bijectors.ordered import Ordered
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
 from tensorflow.python.platform import test

-rng = np.random.RandomState(42)
-

 class OrderedBijectorTest(test.TestCase):
   """Tests correctness of the ordered transformation."""

+  def setUp(self):
+    self._rng = np.random.RandomState(42)
+
+  @test_util.run_in_graph_and_eager_modes()
   def testBijectorVector(self):
     with self.test_session():
       ordered = Ordered()
       self.assertEqual("ordered", ordered.name)
       x = np.asarray([[2., 3, 4], [4., 8, 13]])
       y = [[2., 0, 0], [4., np.log(4.), np.log(5.)]]
-      self.assertAllClose(y, ordered.forward(x).eval())
-      self.assertAllClose(x, ordered.inverse(y).eval())
+      self.assertAllClose(y, self.evaluate(ordered.forward(x)))
+      self.assertAllClose(x, self.evaluate(ordered.inverse(y)))
       self.assertAllClose(
           np.sum(np.asarray(y)[..., 1:], axis=-1),
-          ordered.inverse_log_det_jacobian(y, event_ndims=1).eval(),
+          self.evaluate(ordered.inverse_log_det_jacobian(y, event_ndims=1)),
           atol=0.,
           rtol=1e-7)
       self.assertAllClose(
-          -ordered.inverse_log_det_jacobian(y, event_ndims=1).eval(),
-          ordered.forward_log_det_jacobian(x, event_ndims=1).eval(),
+          self.evaluate(-ordered.inverse_log_det_jacobian(y, event_ndims=1)),
+          self.evaluate(ordered.forward_log_det_jacobian(x, event_ndims=1)),
           atol=0.,
           rtol=1e-7)
@@ -79,6 +82,7 @@ class OrderedBijectorTest(test.TestCase):
           atol=0.,
           rtol=1e-7)

+  @test_util.run_in_graph_and_eager_modes()
   def testShapeGetters(self):
     with self.test_session():
       x = tensor_shape.TensorShape([4])
@@ -86,18 +90,18 @@ class OrderedBijectorTest(test.TestCase):
       bijector = Ordered(validate_args=True)
       self.assertAllEqual(y, bijector.forward_event_shape(x))
       self.assertAllEqual(y.as_list(),
-                          bijector.forward_event_shape_tensor(
-                              x.as_list()).eval())
+                          self.evaluate(bijector.forward_event_shape_tensor(
+                              x.as_list())))
       self.assertAllEqual(x, bijector.inverse_event_shape(y))
       self.assertAllEqual(x.as_list(),
-                          bijector.inverse_event_shape_tensor(
-                              y.as_list()).eval())
+                          self.evaluate(bijector.inverse_event_shape_tensor(
+                              y.as_list())))

   def testBijectiveAndFinite(self):
     with self.test_session():
       ordered = Ordered()
-      x = np.sort(rng.randn(3, 10), axis=-1).astype(np.float32)
-      y = (rng.randn(3, 10)).astype(np.float32)
+      x = np.sort(self._rng.randn(3, 10), axis=-1).astype(np.float32)
+      y = (self._rng.randn(3, 10)).astype(np.float32)
       assert_bijective_and_finite(ordered, x, y, event_ndims=1)

diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
index b2959cce31b..46fec0562c9 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py
@@ -36,6 +36,8 @@ class Ordered(bijector.Bijector):
   """Bijector which maps a tensor x_k that has increasing elements in the last
   dimension to an unconstrained tensor y_k.

+  Both the domain and the codomain of the mapping are [-inf, inf]; however,
+  the input of the forward mapping must be strictly increasing.
   The inverse of the bijector applied to a normal random vector `y ~ N(0, 1)`
   gives back a sorted random vector with the same distribution `x ~ N(0, 1)`
   where `x = sort(y)`
@@ -55,11 +57,7 @@ class Ordered(bijector.Bijector):
   ```
   """

-  def __init__(self,
-               validate_args=False,
-               name="ordered"):
-    self._graph_parents = []
-    self._name = name
+  def __init__(self, validate_args=False, name="ordered"):
     super(Ordered, self).__init__(
         forward_min_event_ndims=1,
         validate_args=validate_args,
@@ -90,21 +88,30 @@ class Ordered(bijector.Bijector):

   def _forward(self, x):
     x = self._maybe_assert_valid_x(x)
-    y0 = array_ops.expand_dims(x[..., 0], -1)
+    y0 = x[..., 0, array_ops.newaxis]
     yk = math_ops.log(x[..., 1:] - x[..., :-1])
     y = array_ops.concat([y0, yk], axis=-1)
     return y

   def _inverse(self, y):
-    x0 = array_ops.expand_dims(y[..., 0], -1)
+    x0 = y[..., 0, array_ops.newaxis]
     xk = math_ops.exp(y[..., 1:])
     x = array_ops.concat([x0, xk], axis=-1)
     return math_ops.cumsum(x, axis=-1)

   def _inverse_log_det_jacobian(self, y):
+    # The Jacobian of the inverse mapping is lower
+    # triangular, with the diagonal elements being:
+    # J[i,i] = 1 if i=1, and
+    # exp(y_i) if 1 < i <= K

Date: Fri, 20 Apr 2018 09:20:36 -0700
Subject: [PATCH 0516/1734] [TF:XLA] Bump open source llvm revision to r330313

PiperOrigin-RevId: 193678317
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index d7bd2a2be0c..aeaf8d7a241 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -451,11 +451,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/3210e64b499a31193051208f2f8922dadfc4bb6f.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/3210e64b499a31193051208f2f8922dadfc4bb6f.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/c1e9b6f826c86c87a7e7173f1baf7e7df9f43e32.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/c1e9b6f826c86c87a7e7173f1baf7e7df9f43e32.tar.gz",
       ],
-      sha256 = "017d7db029cc175634d75416c326770139c76590575ed44a3794c11ab160c955",
-      strip_prefix = "llvm-3210e64b499a31193051208f2f8922dadfc4bb6f",
+      sha256 = "92b7c01074f694a77b4d664951d1ec071e30ef19c61e673158e95fbb6e447b54",
+      strip_prefix = "llvm-c1e9b6f826c86c87a7e7173f1baf7e7df9f43e32",
       build_file = clean_dep("//third_party/llvm:llvm.BUILD"),
   )

From d0e3e998376f5e7d59678e5d42f3497e52ca7622 Mon Sep 17 00:00:00 2001
From: Saurabh Saxena
Date: Fri, 20 Apr 2018 09:23:52 -0700
Subject: [PATCH 0517/1734] Fix msan error in MapAndBatchDataset.

While checkpointing tensors in BatchResult.output, save only the initialized
slice. If the final batch is short, the entire batch tensor may not be
initialized.
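The save/restore asymmetry described here, reduced to a minimal NumPy sketch.
This is illustration only, with hypothetical shapes; the real code operates on
`Tensor` objects and uses `CopyPartialBatch`, as the diff below shows.

```
import numpy as np

batch_size = 4
# A short final batch: only the first 2 of 4 rows are initialized.
partial = np.arange(6, dtype=np.int64).reshape(2, 3)

# Save side: persist only the initialized slice.
saved = partial[:2]

# Restore side: rebuild a full-size batch tensor and copy the slice back.
restored = np.zeros((batch_size,) + saved.shape[1:], dtype=saved.dtype)
restored[:saved.shape[0]] = saved
```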
PiperOrigin-RevId: 193678679 --- .../kernels/data/map_and_batch_dataset_op.cc | 44 +++++++++++++++---- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc index b8105552a0e..605ef3c0b79 100644 --- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc @@ -331,7 +331,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { } CHECK_EQ(batch_results_.size(), batch_results_size); for (size_t i = 0; i < batch_results_size; ++i) { - TF_RETURN_IF_ERROR(ReadBatchResultLocked(reader, i)); + TF_RETURN_IF_ERROR(ReadBatchResultLocked(ctx, reader, i)); } return Status::OK(); } @@ -573,7 +573,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { // finish. This may delay saving a checkpoint by a bit but keeps the // code clean and also saves us from checkpointing the state of the // `BlockingCounter`. - batch_results_[index].counter->Wait(); + int64 num_elements = 0; + WaitForBatch(index, &num_elements).IgnoreError(); + const BatchResult& result = batch_results_[index]; string prefix = strings::StrCat("batch_results_", index); { @@ -587,14 +589,24 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { full_name(strings::StrCat(prefix, "_output_size")), result.output.size())); for (size_t i = 0; i < result.output.size(); i++) { - TF_RETURN_IF_ERROR(writer->WriteTensor( - full_name(strings::StrCat(prefix, "_output_", i)), - result.output[i])); + // If the batch is not full, we only store the first + // `num_elements` values. The rest of the batch tensor is + // *uninitialized* and accessing that will raise msan errors. + if (num_elements < dataset()->batch_size_) { + TF_RETURN_IF_ERROR(writer->WriteTensor( + full_name(strings::StrCat(prefix, "_output_", i)), + result.output[i].Slice(0, num_elements))); + } else { + TF_RETURN_IF_ERROR(writer->WriteTensor( + full_name(strings::StrCat(prefix, "_output_", i)), + result.output[i])); + } } return Status::OK(); } - Status ReadBatchResultLocked(IteratorStateReader* reader, size_t index) + Status ReadBatchResultLocked(IteratorContext* ctx, + IteratorStateReader* reader, size_t index) EXCLUSIVE_LOCKS_REQUIRED(mu_) { BatchResult* result = &batch_results_[index]; string prefix = strings::StrCat("batch_results_", index); @@ -618,10 +630,24 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { } result->output.reserve(output_size); for (size_t i = 0; i < output_size; i++) { - result->output.emplace_back(); + Tensor t; TF_RETURN_IF_ERROR(reader->ReadTensor( - full_name(strings::StrCat(prefix, "_output_", i)), - &result->output.back())); + full_name(strings::StrCat(prefix, "_output_", i)), &t)); + // If the batch was not full, we may have stored only the relevant + // slice. Since tensors in `BatchResult.output` are expected to + // have the leading dimension of size batch_size, we build a larger + // tensor and copy the slice read from the checkpoint into it. 
+          if (t.dim_size(0) < dataset()->batch_size_) {
+            TensorShape component_shape(t.shape());
+            component_shape.set_dim(0, dataset()->batch_size_);
+            AllocatorAttributes attr;
+            attr.set_gpu_compatible(true);
+            Tensor new_t(ctx->allocator(attr), t.dtype(), component_shape);
+            TF_RETURN_IF_ERROR(CopyPartialBatch(&new_t, t, t.dim_size(0)));
+            result->output.emplace_back(std::move(new_t));
+          } else {
+            result->output.emplace_back(std::move(t));
+          }
         }
         return Status::OK();
       }

From cd462f39e58674a43d1f8c156f23235722b2281e Mon Sep 17 00:00:00 2001
From: Mark Daoust
Date: Fri, 20 Apr 2018 09:31:08 -0700
Subject: [PATCH 0518/1734] Don't delete inbound_nodes and outbound_nodes,
 these no longer exist.

PiperOrigin-RevId: 193679512
---
 tensorflow/tools/docs/generate.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tensorflow/tools/docs/generate.py b/tensorflow/tools/docs/generate.py
index c750539a76a..fc93085e3e0 100644
--- a/tensorflow/tools/docs/generate.py
+++ b/tensorflow/tools/docs/generate.py
@@ -43,10 +43,6 @@ if __name__ == '__main__':

   flags = doc_generator.parse_known_args()

-  # Suppress documentation of some symbols that users should never use.
-  del tf.layers.Layer.inbound_nodes
-  del tf.layers.Layer.outbound_nodes
-
   # tf_debug is not imported with tf, it's a separate module altogether
   doc_generator.set_py_modules([('tf', tf), ('tfdbg', tf_debug)])

From fb23c0e166179ccf372203982d8fe79de441e360 Mon Sep 17 00:00:00 2001
From: James Keeling
Date: Fri, 20 Apr 2018 09:54:50 -0700
Subject: [PATCH 0519/1734] Correct error in "Adding An Op" docs.

The macro `REGISTER_KERNEL_BUILDER` always declared a functor specialized on
floats, instead of the type actually passed into the macro.

PiperOrigin-RevId: 193682519
---
 tensorflow/docs_src/extend/adding_an_op.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index 84da2165b59..c3795492cef 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -267,7 +267,7 @@ REGISTER_CPU(int32);
 #ifdef GOOGLE_CUDA
 #define REGISTER_GPU(T)                                          \
   /* Declare explicit instantiations in kernel_example.cu.cc. */ \
-  extern template ExampleFunctor<GPUDevice, float>;              \
+  extern template ExampleFunctor<GPUDevice, T>;                  \
   REGISTER_KERNEL_BUILDER(                                       \
       Name("Example").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
       ExampleOp<GPUDevice, T>);

From a749a6b95932d6f7438a01a2f5fd661343ad536f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Fri, 20 Apr 2018 10:16:03 -0700
Subject: [PATCH 0520/1734] Change the TF record reader to use 16MB buffering
 by default in order to improve performance.
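Why a 16MB buffer helps: each record fetch becomes a copy out of a large
in-memory buffer instead of a small file read. A rough Python sketch of
buffered, record-at-a-time reading in the TFRecord framing; CRC validation is
skipped and the constant name only mirrors the patch, so this is an
illustration of the idea, not the C++ reader.

```
import struct

BUFFER_BYTES = 16 * 1024 * 1024  # mirrors the new kReaderBufferSize

def record_iterator(path):
  with open(path, "rb", buffering=BUFFER_BYTES) as f:
    while True:
      header = f.read(12)  # uint64 length + uint32 masked CRC of the length
      if len(header) < 12:
        return
      (length,) = struct.unpack("<Q", header[:8])
      payload = f.read(length)
      f.read(4)  # skip the masked CRC of the data
      yield payload
```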
PiperOrigin-RevId: 193685521
---
 tensorflow/python/lib/io/py_record_reader.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/lib/io/py_record_reader.cc b/tensorflow/python/lib/io/py_record_reader.cc
index 5fcb51b3b25..9500fc6a7c4 100644
--- a/tensorflow/python/lib/io/py_record_reader.cc
+++ b/tensorflow/python/lib/io/py_record_reader.cc
@@ -43,9 +43,10 @@ PyRecordReader* PyRecordReader::New(const string& filename, uint64 start_offset,
   reader->offset_ = start_offset;
   reader->file_ = file.release();

+  static const uint64 kReaderBufferSize = 16 * 1024 * 1024;
   RecordReaderOptions options =
       RecordReaderOptions::CreateRecordReaderOptions(compression_type_string);
-
+  options.buffer_size = kReaderBufferSize;
   reader->reader_ = new RecordReader(reader->file_, options);
   return reader;
 }

From 729192823935156ae29d7f0d5f64c0bcd6034c7a Mon Sep 17 00:00:00 2001
From: Jacques Pienaar
Date: Fri, 20 Apr 2018 10:32:24 -0700
Subject: [PATCH 0521/1734] Adding Shape inference functions to outfeed
 enqueue ops.

PiperOrigin-RevId: 193688099
---
 tensorflow/contrib/tpu/ops/outfeed_ops.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/contrib/tpu/ops/outfeed_ops.cc b/tensorflow/contrib/tpu/ops/outfeed_ops.cc
index 5900c61a387..b05c76ca64f 100644
--- a/tensorflow/contrib/tpu/ops/outfeed_ops.cc
+++ b/tensorflow/contrib/tpu/ops/outfeed_ops.cc
@@ -26,6 +26,7 @@ REGISTER_OP("OutfeedEnqueue")
     .Input("input: dtype")
     .Attr("dtype: type")
     .SetIsStateful()
+    .SetShapeFn(shape_inference::NoOutputs)
     .Doc(R"doc(
 An op which emits a single Tensor value from an XLA computation.

@@ -36,6 +37,7 @@ REGISTER_OP("OutfeedEnqueueTuple")
     .Input("inputs: dtypes")
     .Attr("dtypes: list(type)")
     .SetIsStateful()
+    .SetShapeFn(shape_inference::NoOutputs)
    .Doc(R"doc(
 An op which emits multiple Tensor values from an XLA computation.

From da5a6d86b856001c03cccace5ac74fa8f045b6ae Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Fri, 20 Apr 2018 10:34:49 -0700
Subject: [PATCH 0522/1734] Disable constant folding and arithmetic
 optimizations for functions.

PiperOrigin-RevId: 193688466
---
 tensorflow/core/grappler/optimizers/meta_optimizer.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 22799311bcd..cdc4698c345 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -243,6 +243,10 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   std::unordered_set<string> optimized_funcs;
   bool optimize_function_library = true;
+ cfg_.set_constant_folding(RewriterConfig::OFF); + cfg_.set_arithmetic_optimization(RewriterConfig::OFF); + while (optimize_function_library) { optimize_function_library = false; From a09c02a3ecc190da8fbae88bdc54505de5387645 Mon Sep 17 00:00:00 2001 From: Junpeng Lao Date: Fri, 20 Apr 2018 20:06:02 +0200 Subject: [PATCH 0523/1734] minor code styling --- .../contrib/distributions/python/ops/bijectors/ordered.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py index 46fec0562c9..a180f1df0c5 100644 --- a/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py +++ b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py @@ -46,7 +46,7 @@ class Ordered(bijector.Bijector): `y[0] = x[0]` `y[1:] = math_ops.log(x[1:] - x[:-1])` - Example Use: + #### Example Use: ```python bijector.Ordered().forward([2, 3, 4]) From b3f379e907259aa166c1ef734ccfd03331eb0a94 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Fri, 20 Apr 2018 11:10:56 -0700 Subject: [PATCH 0524/1734] [XLA:CPU] Use Eigen for F64 dot operations PiperOrigin-RevId: 193694613 --- tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc | 3 ++- tensorflow/compiler/xla/service/cpu/ir_emitter.cc | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc index 29afd8ea5f9..495fecc4aa8 100644 --- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc @@ -1070,7 +1070,8 @@ static bool AreValidGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape, // 1) be matrices with no padding, and // 2) have an allowed element type. PrimitiveType output_primitive_type = output_shape.element_type(); - return (output_primitive_type == F32 || output_primitive_type == F16) && + return (output_primitive_type == F64 || output_primitive_type == F32 || + output_primitive_type == F16) && IsRank2WithNoPadding(lhs_shape) && IsRank2WithNoPadding(rhs_shape) && IsRank2WithNoPadding(output_shape); } diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 3405277d449..f990ee27852 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -2076,7 +2076,7 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) { TF_RETURN_IF_ERROR(ElementTypesSameAndSupported( /*instruction=*/*root, /*operands=*/{lhs, rhs}, - /*supported_types=*/{F16, F32})); + /*supported_types=*/{F16, F32, F64})); llvm_ir::IrArray lhs_array(GetIrArrayFor(lhs)); llvm_ir::IrArray rhs_array(GetIrArrayFor(rhs)); From 49f3469d9533cb12d06ed3907b4ced975e2fcea4 Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Fri, 20 Apr 2018 11:13:16 -0700 Subject: [PATCH 0525/1734] Use CreateWorkerSession and DeleteWorkerSession for all distributed sessions. This change adds a phase to the session creation protocol: the master now contacts all workers to register a session handle and create a "WorkerSession" on each worker before it first registers or runs a graph on any worker. Subsequent requests to a worker ensure that the worker has the session handle registered before performing the request, and an AbortedError is raised if the worker has not (e.g. because it restarted after a failure). 
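The recovery loop those high-level APIs implement, reduced to a hedged sketch:
on `AbortedError`, throw the session away and build a fresh one. `make_session`
is a hypothetical factory, and real code would also rebuild or re-fetch graph
state as needed.

```
import tensorflow as tf

def run_with_retry(make_session, fetch, max_retries=3):
  # Recreate the session when a restarted worker surfaces as AbortedError.
  for _ in range(max_retries):
    sess = make_session()
    try:
      return sess.run(fetch)
    except tf.errors.AbortedError:
      sess.close()  # the worker lost its session state; start over
  raise RuntimeError("too many aborted attempts")
```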
As a result, more failure cases are covered by the high-level APIs (tf.estimator, Slim, etc.) that recreate the session on receiving an AbortedError. Previously, there was a possible race condition in which a PS task could restart between variable initialization and the first step, leading to a FailedPreconditionError ("Attempting to use uninitialized value") that would not be handled by the high-level APIs. PiperOrigin-RevId: 193694958 --- .../core/distributed_runtime/master_session.cc | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc index ebe350d313d..1c67b42e761 100644 --- a/tensorflow/core/distributed_runtime/master_session.cc +++ b/tensorflow/core/distributed_runtime/master_session.cc @@ -89,6 +89,10 @@ class MasterSession::ReffedClientGraph : public core::RefCounted { ~ReffedClientGraph() override { if (should_deregister_) { DeregisterPartitions(); + } else { + for (Part& part : partitions_) { + worker_cache_->ReleaseWorker(part.name, part.worker); + } } } @@ -1174,14 +1178,8 @@ Status MasterSession::Create(GraphDef* graph_def, TF_RETURN_IF_ERROR(GraphExecutionState::MakeForBaseGraph( graph_def, execution_options, &execution_state_)); } - // TODO(b/36574172): Remove these conditions when ClusterSpec - // propagation is supported in all servers. - if (options.cluster_def != nullptr || - session_opts_.config.isolate_session_state()) { - should_delete_worker_sessions_ = true; - return CreateWorkerSessions(options); - } - return Status::OK(); + should_delete_worker_sessions_ = true; + return CreateWorkerSessions(options); } Status MasterSession::CreateWorkerSessions( From 570d90b9c7e6a19bc2606fdaf7ad0f85b8590c0e Mon Sep 17 00:00:00 2001 From: akindyakov Date: Fri, 20 Apr 2018 11:23:15 -0700 Subject: [PATCH 0526/1734] Speed up safe_strtod and safe_strtof functions by using double-conversion library Closes #12102. 
PiperOrigin-RevId: 193696537 --- tensorflow/contrib/cmake/CMakeLists.txt | 4 + .../cmake/external/double_conversion.cmake | 54 ++++++++++++ tensorflow/contrib/makefile/Makefile | 8 +- .../contrib/makefile/download_dependencies.sh | 4 +- tensorflow/core/BUILD | 9 +- tensorflow/core/lib/strings/numbers.cc | 51 +++++++---- tensorflow/core/lib/strings/numbers.h | 2 + tensorflow/core/lib/strings/numbers_test.cc | 87 +++++++++++++++++++ tensorflow/core/lib/strings/str_util.cc | 8 ++ tensorflow/core/lib/strings/str_util.h | 5 ++ tensorflow/core/lib/strings/str_util_test.cc | 56 ++---------- tensorflow/tools/lib_package/BUILD | 2 + tensorflow/tools/pip_package/BUILD | 1 + tensorflow/workspace.bzl | 10 +++ third_party/double_conversion.BUILD | 38 ++++++++ 15 files changed, 270 insertions(+), 69 deletions(-) create mode 100644 tensorflow/contrib/cmake/external/double_conversion.cmake create mode 100644 third_party/double_conversion.BUILD diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt index 23b31ae1dcc..bdf3e986351 100644 --- a/tensorflow/contrib/cmake/CMakeLists.txt +++ b/tensorflow/contrib/cmake/CMakeLists.txt @@ -193,6 +193,7 @@ include(protobuf) include(re2) include(cub) include(sqlite) +include(double_conversion) if (tensorflow_BUILD_CC_TESTS) include(googletest) endif() @@ -213,6 +214,7 @@ set(tensorflow_EXTERNAL_LIBRARIES ${protobuf_STATIC_LIBRARIES} ${re2_STATIC_LIBRARIES} ${sqlite_STATIC_LIBRARIES} + ${double_conversion_STATIC_LIBRARIES} ) if (systemlib_ZLIB) @@ -240,6 +242,7 @@ set(tensorflow_EXTERNAL_DEPENDENCIES fft2d re2 sqlite_copy_headers_to_destination + double_conversion ) include_directories( @@ -262,6 +265,7 @@ include_directories( ${PROTOBUF_INCLUDE_DIRS} ${re2_INCLUDE_DIR} ${sqlite_INCLUDE_DIR} + ${double_conversion_INCLUDE_DIR} ) if(tensorflow_ENABLE_SSL_SUPPORT) diff --git a/tensorflow/contrib/cmake/external/double_conversion.cmake b/tensorflow/contrib/cmake/external/double_conversion.cmake new file mode 100644 index 00000000000..527ccdc8d88 --- /dev/null +++ b/tensorflow/contrib/cmake/external/double_conversion.cmake @@ -0,0 +1,54 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +include (ExternalProject) + +set(double_conversion_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/double_conversion/src/double_conversion) +set(double_conversion_URL https://github.com/google/double-conversion.git) +set(double_conversion_TAG 5664746) +set(double_conversion_BUILD ${double_conversion_INCLUDE_DIR}) +set(double_conversion_LIBRARIES ${double_conversion_BUILD}/double-conversion/libdouble-conversion.so) +set(double_conversion_INCLUDES ${double_conversion_BUILD}) + +if(WIN32) + set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/double-conversion/$(Configuration)/double-conversion.lib) +else() + set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/double-conversion/libdouble-conversion.a) +endif() + +set(double_conversion_HEADERS + "${double_conversion_INCLUDE_DIR}/double-conversion/bignum-dtoa.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/cached-powers.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/double-conversion.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/fixed-dtoa.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/strtod.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/bignum.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/diy-fp.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/fast-dtoa.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/ieee.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/utils.h" +) + +ExternalProject_Add(double_conversion + PREFIX double_conversion + GIT_REPOSITORY ${double_conversion_URL} + GIT_TAG ${double_conversion_TAG} + DOWNLOAD_DIR "${DOWNLOAD_LOCATION}" + BUILD_IN_SOURCE 1 + INSTALL_COMMAND "" + CMAKE_CACHE_ARGS + -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON +) diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile index 05e8d9064be..1a1ab54a53d 100644 --- a/tensorflow/contrib/makefile/Makefile +++ b/tensorflow/contrib/makefile/Makefile @@ -89,6 +89,7 @@ HOST_INCLUDES := \ -I$(MAKEFILE_DIR)/downloads/gemmlowp \ -I$(MAKEFILE_DIR)/downloads/nsync/public \ -I$(MAKEFILE_DIR)/downloads/fft2d \ +-I$(MAKEFILE_DIR)/downloads/double_conversion \ -I$(HOST_GENDIR) ifeq ($(HAS_GEN_HOST_PROTOC),true) HOST_INCLUDES += -I$(MAKEFILE_DIR)/gen/protobuf-host/include @@ -125,7 +126,9 @@ PROTO_TEXT := $(HOST_BINDIR)proto_text # The list of dependencies is derived from the Bazel build file by running # the gen_file_lists.sh script on a system with a working Bazel setup. PROTO_TEXT_CC_FILES := $(shell cat $(MAKEFILE_DIR)/proto_text_cc_files.txt) -PROTO_TEXT_PB_CC_LIST := $(shell cat $(MAKEFILE_DIR)/proto_text_pb_cc_files.txt) +PROTO_TEXT_PB_CC_LIST := \ + $(shell cat $(MAKEFILE_DIR)/proto_text_pb_cc_files.txt) \ + $(wildcard tensorflow/contrib/makefile/downloads/double_conversion/double-conversion/*.cc) PROTO_TEXT_PB_H_LIST := $(shell cat $(MAKEFILE_DIR)/proto_text_pb_h_files.txt) # Locations of the intermediate files proto_text generates. 
@@ -171,6 +174,7 @@ INCLUDES := \ -I$(MAKEFILE_DIR)/downloads/gemmlowp \ -I$(MAKEFILE_DIR)/downloads/nsync/public \ -I$(MAKEFILE_DIR)/downloads/fft2d \ +-I$(MAKEFILE_DIR)/downloads/double_conversion \ -I$(PROTOGENDIR) \ -I$(PBTGENDIR) ifeq ($(HAS_GEN_HOST_PROTOC),true) @@ -326,6 +330,7 @@ $(MARCH_OPTION) \ -I$(MAKEFILE_DIR)/downloads/gemmlowp \ -I$(MAKEFILE_DIR)/downloads/nsync/public \ -I$(MAKEFILE_DIR)/downloads/fft2d \ +-I$(MAKEFILE_DIR)/downloads/double_conversion \ -I$(MAKEFILE_DIR)/gen/protobuf_android/$(ANDROID_ARCH)/include \ -I$(PROTOGENDIR) \ -I$(PBTGENDIR) @@ -603,6 +608,7 @@ $(wildcard tensorflow/core/platform/*/*.cc) \ $(wildcard tensorflow/core/platform/*/*/*.cc) \ $(wildcard tensorflow/core/util/*.cc) \ $(wildcard tensorflow/core/util/*/*.cc) \ +$(wildcard tensorflow/contrib/makefile/downloads/double_conversion/double-conversion/*.cc) \ tensorflow/core/util/version_info.cc # Remove duplicates (for version_info.cc) CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS)) diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh index 8b415e6527f..48953e2e384 100755 --- a/tensorflow/contrib/makefile/download_dependencies.sh +++ b/tensorflow/contrib/makefile/download_dependencies.sh @@ -32,7 +32,8 @@ GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.g NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" PROTOBUF_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" RE2_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" -FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" +FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)" +DOUBLE_CONVERSION_URL="$(grep -o "https.*google/double-conversion.*\.zip" "${BZL_FILE_PATH}" | head -n1)" ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)" CUB_URL="$(grep -o 'https.*cub/archive.*zip' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" @@ -87,6 +88,7 @@ download_and_extract "${NSYNC_URL}" "${DOWNLOADS_DIR}/nsync" download_and_extract "${PROTOBUF_URL}" "${DOWNLOADS_DIR}/protobuf" download_and_extract "${RE2_URL}" "${DOWNLOADS_DIR}/re2" download_and_extract "${FFT2D_URL}" "${DOWNLOADS_DIR}/fft2d" +download_and_extract "${DOUBLE_CONVERSION_URL}" "${DOWNLOADS_DIR}/double_conversion" download_and_extract "${ABSL_URL}" "${DOWNLOADS_DIR}/absl" download_and_extract "${CUB_URL}" "${DOWNLOADS_DIR}/cub/external/cub_archive" diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index c15e7de186f..5b04574a4fa 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -337,7 +337,9 @@ cc_library( "lib/bfloat16/bfloat16.h", ] + tf_additional_proto_hdrs() + glob(tf_env_time_hdrs()), copts = tf_copts(), - deps = tf_lib_proto_parsing_deps(), + deps = tf_lib_proto_parsing_deps() + [ + "@double_conversion//:double-conversion", + ], ) # This build rule (along with :lib_internal, :framework, and @@ -1231,6 +1233,7 @@ cc_library( deps = [ ":protos_all_cc_impl", "//third_party/eigen3", + "@double_conversion//:double-conversion", "@nsync//:nsync_cpp", "@protobuf_archive//:protobuf", ], @@ -1270,6 +1273,7 @@ cc_library( deps = [ ":protos_all_cc_impl", "//third_party/eigen3", + "@double_conversion//:double-conversion", "@nsync//:nsync_cpp", 
"@protobuf_archive//:protobuf", ], @@ -1333,6 +1337,7 @@ cc_library( deps = [ ":protos_all_cc_impl", "//third_party/eigen3", + "@double_conversion//:double-conversion", "@nsync//:nsync_cpp", "@protobuf_archive//:protobuf", ], @@ -1355,6 +1360,7 @@ cc_library( deps = [ ":protos_all_cc_impl", "//third_party/eigen3", + "@double_conversion//:double-conversion", "@nsync//:nsync_cpp", "@protobuf_archive//:protobuf", ], @@ -1751,6 +1757,7 @@ cc_library( "//tensorflow/core/platform/default/build_config:platformlib", "@snappy", "@zlib_archive//:zlib", + "@double_conversion//:double-conversion", "@protobuf_archive//:protobuf", ] + tf_protos_all_impl() + tf_protos_grappler_impl(), ) diff --git a/tensorflow/core/lib/strings/numbers.cc b/tensorflow/core/lib/strings/numbers.cc index c296daa95d6..e4b909296e8 100644 --- a/tensorflow/core/lib/strings/numbers.cc +++ b/tensorflow/core/lib/strings/numbers.cc @@ -23,6 +23,8 @@ limitations under the License. #include #include +#include "double-conversion/double-conversion.h" + #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" @@ -110,6 +112,17 @@ T locale_independent_strtonum(const char* str, const char** endptr) { return result; } +static inline const double_conversion::StringToDoubleConverter& +StringToFloatConverter() { + static const double_conversion::StringToDoubleConverter converter( + double_conversion::StringToDoubleConverter::ALLOW_LEADING_SPACES | + double_conversion::StringToDoubleConverter::ALLOW_HEX | + double_conversion::StringToDoubleConverter::ALLOW_TRAILING_SPACES | + double_conversion::StringToDoubleConverter::ALLOW_CASE_INSENSIBILITY, + 0., 0., "inf", "nan"); + return converter; +} + } // namespace namespace strings { @@ -319,25 +332,31 @@ bool safe_strtou32(StringPiece str, uint32* value) { } bool safe_strtof(const char* str, float* value) { - const char* endptr; - *value = locale_independent_strtonum(str, &endptr); - while (isspace(*endptr)) ++endptr; - // Ignore range errors from strtod/strtof. - // The values it returns on underflow and - // overflow are the right fallback in a - // robust setting. - return *str != '\0' && *endptr == '\0'; + int processed_characters_count = -1; + auto len = str_util::Strnlen(str, kFastToBufferSize); + + // If there is no zero-termination in str, fail. + if (len == kFastToBufferSize) return false; + // If string length exceeds int max, fail. + if (len > std::numeric_limits::max()) return false; + + *value = StringToFloatConverter().StringToFloat(str, static_cast(len), + &processed_characters_count); + return processed_characters_count > 0; } bool safe_strtod(const char* str, double* value) { - const char* endptr; - *value = locale_independent_strtonum(str, &endptr); - while (isspace(*endptr)) ++endptr; - // Ignore range errors from strtod/strtof. - // The values it returns on underflow and - // overflow are the right fallback in a - // robust setting. - return *str != '\0' && *endptr == '\0'; + int processed_characters_count = -1; + auto len = str_util::Strnlen(str, kFastToBufferSize); + + // If there is no zero-termination in str, fail. + if (len == kFastToBufferSize) return false; + // If string length exceeds int max, fail. 
+ if (len > std::numeric_limits::max()) return false; + + *value = StringToFloatConverter().StringToDouble(str, static_cast(len), + &processed_characters_count); + return processed_characters_count > 0; } size_t FloatToBuffer(float value, char* buffer) { diff --git a/tensorflow/core/lib/strings/numbers.h b/tensorflow/core/lib/strings/numbers.h index 6b7703be378..e9add428492 100644 --- a/tensorflow/core/lib/strings/numbers.h +++ b/tensorflow/core/lib/strings/numbers.h @@ -114,11 +114,13 @@ bool safe_strtou64(StringPiece str, uint64* value); // Convert strings to floating point values. // Leading and trailing spaces are allowed. // Values may be rounded on over- and underflow. +// Returns false on invalid input or if `strlen(value) >= kFastToBufferSize`. bool safe_strtof(const char* str, float* value); // Convert strings to double precision floating point values. // Leading and trailing spaces are allowed. // Values may be rounded on over- and underflow. +// Returns false on invalid input or if `strlen(value) >= kFastToBufferSize`. bool safe_strtod(const char* str, double* value); inline bool ProtoParseNumeric(StringPiece s, int32* value) { diff --git a/tensorflow/core/lib/strings/numbers_test.cc b/tensorflow/core/lib/strings/numbers_test.cc index e15161de66c..0f22dac262b 100644 --- a/tensorflow/core/lib/strings/numbers_test.cc +++ b/tensorflow/core/lib/strings/numbers_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/core/lib/strings/numbers.h" +#include #include #include "tensorflow/core/platform/test.h" @@ -277,7 +278,49 @@ TEST(safe_strtof, Float) { EXPECT_TRUE(safe_strtof("-0x2A", &result)); EXPECT_EQ(-42.0f, result); + EXPECT_TRUE(safe_strtof(" -0x2", &result)); + EXPECT_EQ(-2.0f, result); + + EXPECT_TRUE(safe_strtof("8 \t", &result)); + EXPECT_EQ(8.0f, result); + + EXPECT_TRUE(safe_strtof("\t20.0\t ", &result)); + EXPECT_EQ(20.0f, result); + EXPECT_FALSE(safe_strtof("-infinity is awesome", &result)); + + // Make sure we exit cleanly if the string is not terminated + char test_str[2 * kFastToBufferSize]; + for (int i = 0; i < 2 * kFastToBufferSize; ++i) test_str[i] = 'a'; + EXPECT_FALSE(safe_strtof(test_str, &result)); + + // Make sure we exit cleanly if the string is too long + test_str[kFastToBufferSize + 1] = '\0'; + EXPECT_FALSE(safe_strtof(test_str, &result)); + + EXPECT_TRUE(safe_strtof("-inf", &result)); + EXPECT_EQ(-std::numeric_limits::infinity(), result); + + EXPECT_TRUE(safe_strtof("+inf", &result)); + EXPECT_EQ(std::numeric_limits::infinity(), result); + + EXPECT_TRUE(safe_strtof("InF", &result)); + EXPECT_EQ(std::numeric_limits::infinity(), result); + + EXPECT_TRUE(safe_strtof("-INF", &result)); + EXPECT_EQ(-std::numeric_limits::infinity(), result); + + EXPECT_TRUE(safe_strtof("nan", &result)); + EXPECT_TRUE(std::isnan(result)); + + EXPECT_TRUE(safe_strtof("-nan", &result)); + EXPECT_TRUE(std::isnan(result)); + + EXPECT_TRUE(safe_strtof("-NaN", &result)); + EXPECT_TRUE(std::isnan(result)); + + EXPECT_TRUE(safe_strtof("+NAN", &result)); + EXPECT_TRUE(std::isnan(result)); } TEST(safe_strtod, Double) { @@ -287,6 +330,15 @@ TEST(safe_strtod, Double) { EXPECT_EQ(0.1234567890123, result); EXPECT_FALSE(safe_strtod("0.1234567890123abc", &result)); + // Make sure we exit cleanly if the string is not terminated + char test_str[2 * kFastToBufferSize]; + for (int i = 0; i < 2 * kFastToBufferSize; ++i) test_str[i] = 'a'; + EXPECT_FALSE(safe_strtod(test_str, &result)); + + // Make sure we exit cleanly if the string is too long + test_str[kFastToBufferSize + 
1] = '\0'; + EXPECT_FALSE(safe_strtod(test_str, &result)); + // Overflow to infinity, underflow to 0. EXPECT_TRUE(safe_strtod("1e310", &result)); EXPECT_EQ(std::numeric_limits::infinity(), result); @@ -296,6 +348,41 @@ TEST(safe_strtod, Double) { EXPECT_TRUE(safe_strtod("1e-325", &result)); EXPECT_EQ(0, result); + + EXPECT_TRUE(safe_strtod(" -0x1c", &result)); + EXPECT_EQ(-28.0, result); + + EXPECT_TRUE(safe_strtod("50 \t", &result)); + EXPECT_EQ(50.0, result); + + EXPECT_TRUE(safe_strtod("\t82.0\t ", &result)); + EXPECT_EQ(82.0, result); + + EXPECT_FALSE(safe_strtod("infinity", &result)); + + EXPECT_TRUE(safe_strtod("-inf", &result)); + EXPECT_EQ(-std::numeric_limits::infinity(), result); + + EXPECT_TRUE(safe_strtod("+inf", &result)); + EXPECT_EQ(std::numeric_limits::infinity(), result); + + EXPECT_TRUE(safe_strtod("InF", &result)); + EXPECT_EQ(std::numeric_limits::infinity(), result); + + EXPECT_TRUE(safe_strtod("-INF", &result)); + EXPECT_EQ(-std::numeric_limits::infinity(), result); + + EXPECT_TRUE(safe_strtod("nan", &result)); + EXPECT_TRUE(std::isnan(result)); + + EXPECT_TRUE(safe_strtod("-nan", &result)); + EXPECT_TRUE(std::isnan(result)); + + EXPECT_TRUE(safe_strtod("-NaN", &result)); + EXPECT_TRUE(std::isnan(result)); + + EXPECT_TRUE(safe_strtod("+NAN", &result)); + EXPECT_TRUE(std::isnan(result)); } } // namespace strings diff --git a/tensorflow/core/lib/strings/str_util.cc b/tensorflow/core/lib/strings/str_util.cc index 2c9e98357a1..4598b8ccc79 100644 --- a/tensorflow/core/lib/strings/str_util.cc +++ b/tensorflow/core/lib/strings/str_util.cc @@ -454,6 +454,14 @@ bool SplitAndParseAsFloats(StringPiece text, char delim, result); } +size_t Strnlen(const char* str, const size_t string_max_len) { + size_t len = 0; + while (len < string_max_len && str[len] != '\0') { + ++len; + } + return len; +} + bool StrContains(StringPiece haystack, StringPiece needle) { return std::search(haystack.begin(), haystack.end(), needle.begin(), needle.end()) != haystack.end(); diff --git a/tensorflow/core/lib/strings/str_util.h b/tensorflow/core/lib/strings/str_util.h index 065871c1b4b..e97d00b975e 100644 --- a/tensorflow/core/lib/strings/str_util.h +++ b/tensorflow/core/lib/strings/str_util.h @@ -223,6 +223,11 @@ std::vector Split(StringPiece text, char delims, Predicate p) { return Split(text, StringPiece(&delims, 1), p); } +// Returns the length of the given null-terminated byte string 'str'. +// Returns 'string_max_len' if the null character was not found in the first +// 'string_max_len' bytes of 'str'. 
+size_t Strnlen(const char* str, const size_t string_max_len); + } // namespace str_util } // namespace tensorflow diff --git a/tensorflow/core/lib/strings/str_util_test.cc b/tensorflow/core/lib/strings/str_util_test.cc index 63643c3e8ed..3bf3e99825f 100644 --- a/tensorflow/core/lib/strings/str_util_test.cc +++ b/tensorflow/core/lib/strings/str_util_test.cc @@ -430,56 +430,12 @@ TEST(StringReplace, EmptyStringReplaceAll) { EXPECT_EQ("", str_util::StringReplace("", "a", "X", /*replace_all=*/true)); } -TEST(StartsWith, Basic) { - const string s1( - "123" - "\0" - "456", - 7); - const StringPiece a("foobar"); - const StringPiece b(s1); - const StringPiece e; - EXPECT_TRUE(str_util::StartsWith(a, a)); - EXPECT_TRUE(str_util::StartsWith(a, "foo")); - EXPECT_TRUE(str_util::StartsWith(a, e)); - EXPECT_TRUE(str_util::StartsWith(b, s1)); - EXPECT_TRUE(str_util::StartsWith(b, b)); - EXPECT_TRUE(str_util::StartsWith(b, e)); - EXPECT_TRUE(str_util::StartsWith(e, "")); - EXPECT_FALSE(str_util::StartsWith(a, b)); - EXPECT_FALSE(str_util::StartsWith(b, a)); - EXPECT_FALSE(str_util::StartsWith(e, a)); -} - -TEST(EndsWith, Basic) { - const string s1( - "123" - "\0" - "456", - 7); - const StringPiece a("foobar"); - const StringPiece b(s1); - const StringPiece e; - EXPECT_TRUE(str_util::EndsWith(a, a)); - EXPECT_TRUE(str_util::EndsWith(a, "bar")); - EXPECT_TRUE(str_util::EndsWith(a, e)); - EXPECT_TRUE(str_util::EndsWith(b, s1)); - EXPECT_TRUE(str_util::EndsWith(b, b)); - EXPECT_TRUE(str_util::EndsWith(b, e)); - EXPECT_TRUE(str_util::EndsWith(e, "")); - EXPECT_FALSE(str_util::EndsWith(a, b)); - EXPECT_FALSE(str_util::EndsWith(b, a)); - EXPECT_FALSE(str_util::EndsWith(e, a)); -} - -TEST(StrContains, Basic) { - StringPiece a("abcdefg"); - StringPiece b("abcd"); - StringPiece c("efg"); - StringPiece d("gh"); - EXPECT_TRUE(str_util::StrContains(a, b)); - EXPECT_TRUE(str_util::StrContains(a, c)); - EXPECT_TRUE(!str_util::StrContains(a, d)); +TEST(Strnlen, Basic) { + EXPECT_EQ(0, str_util::Strnlen("ab", 0)); + EXPECT_EQ(1, str_util::Strnlen("a", 1)); + EXPECT_EQ(2, str_util::Strnlen("abcd", 2)); + EXPECT_EQ(3, str_util::Strnlen("abc", 10)); + EXPECT_EQ(4, str_util::Strnlen("a \t\n", 10)); } } // namespace tensorflow diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD index 0ede8c63704..569b6678cab 100644 --- a/tensorflow/tools/lib_package/BUILD +++ b/tensorflow/tools/lib_package/BUILD @@ -118,6 +118,7 @@ genrule( "@com_googlesource_code_re2//:LICENSE", "@cub_archive//:LICENSE.TXT", "@curl//:COPYING", + "@double_conversion//:LICENSE", "@eigen_archive//:COPYING.MPL2", "@farmhash_archive//:COPYING", "@fft2d//:fft/readme.txt", @@ -155,6 +156,7 @@ genrule( "@com_googlesource_code_re2//:LICENSE", "@cub_archive//:LICENSE.TXT", "@curl//:COPYING", + "@double_conversion//:LICENSE", "@eigen_archive//:COPYING.MPL2", "@farmhash_archive//:COPYING", "@fft2d//:fft/readme.txt", diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index 0ac5a5bb6dd..7b508f87ab7 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -128,6 +128,7 @@ filegroup( "@com_googlesource_code_re2//:LICENSE", "@cub_archive//:LICENSE.TXT", "@curl//:COPYING", + "@double_conversion//:LICENSE", "@eigen_archive//:COPYING.MPL2", "@farmhash_archive//:COPYING", "@fft2d//:fft/readme.txt", diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index aeaf8d7a241..bbef4b9e5f9 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -693,6 +693,16 
@@ def tf_workspace(path_prefix="", tf_repo_name=""): build_file = clean_dep("//third_party/flatbuffers:flatbuffers.BUILD"), ) + native.new_http_archive( + name = "double_conversion", + urls = [ + "https://github.com/google/double-conversion/archive/3992066a95b823efc8ccc1baf82a1cfc73f6e9b8.zip", + ], + sha256 = "2f7fbffac0d98d201ad0586f686034371a6d152ca67508ab611adc2386ad30de", + strip_prefix = "double-conversion-3992066a95b823efc8ccc1baf82a1cfc73f6e9b8", + build_file = clean_dep("//third_party:double_conversion.BUILD") + ) + tf_http_archive( name = "tflite_mobilenet", sha256 = "23f814d1c076bdf03715dfb6cab3713aa4fbdf040fd5448c43196bd2e97a4c1b", diff --git a/third_party/double_conversion.BUILD b/third_party/double_conversion.BUILD new file mode 100644 index 00000000000..9f905216c03 --- /dev/null +++ b/third_party/double_conversion.BUILD @@ -0,0 +1,38 @@ +# Bazel(http://bazel.io) BUILD file + +licenses(["notice"]) + +exports_files(["LICENSE"]) + +cc_library( + name = "double-conversion", + srcs = [ + "double-conversion/bignum.cc", + "double-conversion/bignum-dtoa.cc", + "double-conversion/cached-powers.cc", + "double-conversion/diy-fp.cc", + "double-conversion/double-conversion.cc", + "double-conversion/fast-dtoa.cc", + "double-conversion/fixed-dtoa.cc", + "double-conversion/strtod.cc", + "double-conversion/utils.h", + ], + hdrs = [ + "double-conversion/bignum.h", + "double-conversion/bignum-dtoa.h", + "double-conversion/cached-powers.h", + "double-conversion/diy-fp.h", + "double-conversion/double-conversion.h", + "double-conversion/fast-dtoa.h", + "double-conversion/fixed-dtoa.h", + "double-conversion/ieee.h", + "double-conversion/strtod.h", + ], + includes = [ + ".", + ], + linkopts = [ + "-lm", + ], + visibility = ["//visibility:public"], +) From 5fbb1feecd77a70b32d333b56bd13b1798b9a766 Mon Sep 17 00:00:00 2001 From: James Qin Date: Fri, 20 Apr 2018 11:23:29 -0700 Subject: [PATCH 0527/1734] Temporarily set cudnn Rnn math precision to fp32. Problem: When calling cudnnGetRNNLinLayerMatrixParams(), return error CUDNN_STATUS_BAD_PARAM if: * RNN descriptor set math precision = CUDNN_DATA_FLOAT * input descriptor dataType = CUDNN_DATA_HALF * weight descriptor dataType= CUDNN_DATA_HALF If updating Rnn descriptor math precision to CUDNN_DATA_HALF, then no error. cudnn 7.1.4 will fix the problem. PiperOrigin-RevId: 193696566 --- tensorflow/stream_executor/cuda/cuda_dnn.cc | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index d673e19007d..640f270323c 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -2529,12 +2529,20 @@ cudnnDataType_t GetConvComputeType() { } // A helper struct to decide whether to use FP32 as the internal compute type -// for rnn when the input data type is FP16. By default it is turned on, -// users can explicitly disable them (choose to use FP16 as the internal compute -// type) through an env-var "TF_FP16_RNN_USE_FP32_COMPUTE=0". +// for rnn when the input data type is FP16. At present it is turned off, +// users can explicitly control them through an env-var +// TF_FP16_RNN_USE_FP32_COMPUTE. +// After the TODO below is fixed, users should almost always use fp32 compute +// type for training. Using fp16 might suffer suboptimal accuracy due to loss +// in precision. 
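+// Example (illustrative): once the default flips back to true, running with
+// TF_FP16_RNN_USE_FP32_COMPUTE=0 would opt back into fp16 accumulation for
+// fp16 RNN inputs.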
 struct RnnDoFP32ComputationFP16Input {
   static constexpr const char* kName = "TF_FP16_RNN_USE_FP32_COMPUTE";
-  static constexpr bool kDefaultFlag = true;
+  // TODO(jamesqin): b/78182362 flip to true when cudnn 7.1.4 fixes the bug.
+  // Before cudnn 7.1.4 RNN are always done in fp32, no matter what math
+  // precision is set.
+  // Set it temporarily to false s.t. no error is raised when using fp16
+  // inputs, fp32 math precision.
+  static constexpr bool kDefaultFlag = false;
 };
 
 // A helper function to return the internal compute type for

From 712bbc5d7babd523951445f361f0e339061cd259 Mon Sep 17 00:00:00 2001
From: Akshay Modi
Date: Fri, 20 Apr 2018 11:24:53 -0700
Subject: [PATCH 0528/1734] Allow creating tensors from numpy arrays, and
 other various constants - try #2

Allow type-inference from a different input tensor, similar to
args_to_matching_eager.

- Update TFE_Py_TensorShapeSlice to take tuples.
- Update int values to allow int/long in py2

END_PUBLIC
BEGIN_PUBLIC
Automated g4 rollback of changelist 192184809

PiperOrigin-RevId: 193696790
---
 tensorflow/python/eager/pywrap_tensor.cc  | 201 ++++++++--------
 tensorflow/python/eager/pywrap_tensor.h   |  10 +
 tensorflow/python/eager/pywrap_tfe.h      |  12 +-
 tensorflow/python/eager/pywrap_tfe_src.cc | 278 +++++++++++++++++++---
 tensorflow/python/eager/tensor_test.py    |   7 +-
 tensorflow/python/framework/ops.py        |  16 ++
 6 files changed, 389 insertions(+), 135 deletions(-)

diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index 519814b979e..b5b4e394e33 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -60,42 +60,6 @@ TFE_TensorHandle* NumpyToTensorHandle(PyObject* obj) {
   }
 }
 
-// Casts data referred to by `handle` from type `src_type_enum` to type
-// `dst_type_enum`.
-TFE_TensorHandle* EagerCast(TFE_Context* ctx, TFE_TensorHandle* handle,
-                            TF_DataType src_type_enum,
-                            TF_DataType dst_type_enum, TF_Status* out_status) {
-  if (ctx == nullptr) return nullptr;
-  const char* op_name = "Cast";
-  const char* device_name = "/job:localhost/replica:0/task:0/device:CPU:0";
-  TFE_Op* op = TFE_NewOp(ctx, op_name, out_status);
-#define RETURN_ERROR  \
-  {                   \
-    TFE_DeleteOp(op); \
-    return nullptr;   \
-  }
-  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
-  TFE_OpSetDevice(op, device_name, out_status);
-  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
-  TFE_OpAddInput(op, handle, out_status);
-  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
-  TFE_OpSetAttrType(op, "SrcT", src_type_enum);
-  TFE_OpSetAttrType(op, "DstT", dst_type_enum);
-  TFE_TensorHandle* output = nullptr;
-  int num_outputs = 1;
-  TFE_Execute(op, &output, &num_outputs, out_status);
-  if (TF_GetCode(out_status) != TF_OK || num_outputs != 1 ||
-      output == nullptr) {
-    if (output != nullptr) {
-      TFE_DeleteTensorHandle(output);
-    }
-    RETURN_ERROR
-  }
-  TFE_DeleteOp(op);
-  return output;
-#undef RETURN_ERROR
-}
-
 TFE_TensorHandle* CopyToDevice(TFE_TensorHandle* handle, PyObject* ctx,
                                PyObject* dev) {
   const char* device = "";
@@ -161,6 +125,100 @@ PyObject* PyIntFromDataType(TF_DataType l) {
 
 }  // namespace
 
+namespace tensorflow {
+// Casts data referred to by `handle` from type `src_type_enum` to type
+// `dst_type_enum`.
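+// On success the caller owns the returned handle; on failure nullptr is
+// returned and `out_status` carries the error.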
+TFE_TensorHandle* EagerCast(TFE_Context* ctx, TFE_TensorHandle* handle,
+                            TF_DataType src_type_enum,
+                            TF_DataType dst_type_enum, TF_Status* out_status) {
+  if (ctx == nullptr) return nullptr;
+  const char* op_name = "Cast";
+  const char* device_name = "/job:localhost/replica:0/task:0/device:CPU:0";
+  TFE_Op* op = TFE_NewOp(ctx, op_name, out_status);
+#define RETURN_ERROR  \
+  {                   \
+    TFE_DeleteOp(op); \
+    return nullptr;   \
+  }
+  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
+  TFE_OpSetDevice(op, device_name, out_status);
+  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
+  TFE_OpAddInput(op, handle, out_status);
+  if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
+  TFE_OpSetAttrType(op, "SrcT", src_type_enum);
+  TFE_OpSetAttrType(op, "DstT", dst_type_enum);
+  TFE_TensorHandle* output = nullptr;
+  int num_outputs = 1;
+  TFE_Execute(op, &output, &num_outputs, out_status);
+  if (TF_GetCode(out_status) != TF_OK || num_outputs != 1 ||
+      output == nullptr) {
+    if (output != nullptr) {
+      TFE_DeleteTensorHandle(output);
+    }
+    RETURN_ERROR
+  }
+  TFE_DeleteOp(op);
+  return output;
+#undef RETURN_ERROR
+}
+
+TFE_TensorHandle* ConvertToEagerTensor(PyObject* value, PyObject* dtype) {
+  int desired_dtype = -1;
+  if (dtype != Py_None) {
+    if (!PyIntToDataType(dtype, &desired_dtype)) {
+      PyErr_SetString(PyExc_TypeError,
+                      tensorflow::strings::StrCat(
+                          "Expecting a DataType value for dtype. Got ",
+                          Py_TYPE(dtype)->tp_name)
+                          .c_str());
+      return nullptr;
+    }
+  }
+  if (PyArray_Check(value)) {
+    int desired_np_dtype = -1;
+    if (desired_dtype >= 0) {
+      if (!tensorflow::TF_DataType_to_PyArray_TYPE(
+               static_cast<TF_DataType>(desired_dtype), &desired_np_dtype)
+               .ok()) {
+        PyErr_SetString(PyExc_TypeError,
+                        tensorflow::strings::StrCat(
+                            "Invalid dtype argument value ", desired_dtype)
+                            .c_str());
+        return nullptr;
+      }
+    }
+    PyArrayObject* array = reinterpret_cast<PyArrayObject*>(value);
+    int current_np_dtype = PyArray_TYPE(array);
+    auto safe_value = tensorflow::make_safe(static_cast<PyObject*>(nullptr));
+    if ((desired_np_dtype >= 0 && desired_np_dtype != current_np_dtype) ||
+        !PyArray_ISCARRAY(array)) {
+      int new_dtype =
+          desired_np_dtype >= 0 ? desired_np_dtype : current_np_dtype;
+      safe_value = tensorflow::make_safe(
+          PyArray_FromAny(value, PyArray_DescrFromType(new_dtype), 0, 0,
+                          NPY_ARRAY_CARRAY | NPY_ARRAY_FORCECAST, nullptr));
+      if (PyErr_Occurred()) return nullptr;
+      if (safe_value == nullptr) {
+        PyErr_SetString(PyExc_ValueError, "Error while casting a numpy value");
+        return nullptr;
+      }
+      value = safe_value.get();
+    }
+    return NumpyToTensorHandle(value);
+  } else {
+    tensorflow::Tensor t;
+    // TODO(josh11b): Have PySeqToTensor set python errors instead of
+    // returning Status.
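+    // (Non-ndarray values such as python lists and scalars take this branch
+    // and are materialized through a tensorflow::Tensor first.)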
+    auto cppstatus = tensorflow::PySeqToTensor(value, dtype, &t);
+    if (!cppstatus.ok()) {
+      PyErr_SetString(PyExc_ValueError, cppstatus.error_message().c_str());
+      return nullptr;
+    }
+    return TFE_NewTensorHandle(t);
+  }
+}
+}  // namespace tensorflow
+
 extern "C" {
 
 static const int kMaxEagerTensorParentSize = 64;
@@ -230,61 +288,16 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) {
       return -1;
     }
   }
-  tensorflow::Safe_TFE_TensorHandlePtr handle =
-      tensorflow::make_safe(static_cast<TFE_TensorHandle*>(nullptr));
   PyErr_Clear();
-  if (PyArray_Check(value)) {
-    int desired_np_dtype = -1;
-    if (desired_dtype >= 0) {
-      if (!tensorflow::TF_DataType_to_PyArray_TYPE(
-               static_cast<TF_DataType>(desired_dtype), &desired_np_dtype)
-               .ok()) {
-        PyErr_SetString(PyExc_TypeError,
-                        tensorflow::strings::StrCat(
-                            "Invalid dtype argument value ", desired_dtype)
-                            .c_str());
-        return -1;
-      }
-    }
-    PyArrayObject* array = reinterpret_cast<PyArrayObject*>(value);
-    int current_np_dtype = PyArray_TYPE(array);
-    auto safe_value = tensorflow::make_safe(static_cast<PyObject*>(nullptr));
-    if ((desired_np_dtype >= 0 && desired_np_dtype != current_np_dtype) ||
-        !PyArray_ISCARRAY(array)) {
-      int new_dtype =
-          desired_np_dtype >= 0 ? desired_np_dtype : current_np_dtype;
-      safe_value = tensorflow::make_safe(
-          PyArray_FromAny(value, PyArray_DescrFromType(new_dtype), 0, 0,
-                          NPY_ARRAY_CARRAY | NPY_ARRAY_FORCECAST, nullptr));
-      if (PyErr_Occurred()) return -1;
-      if (safe_value == nullptr) {
-        PyErr_SetString(PyExc_ValueError, "Error while casting a numpy value");
-        return -1;
-      }
-      value = safe_value.get();
-    }
-    handle = tensorflow::make_safe(NumpyToTensorHandle(value));
-  } else {
-    tensorflow::Tensor t;
-    // TODO(josh11b): Have PySeqToTensor set python errors instead of
-    // returning Status.
-    auto cppstatus = tensorflow::PySeqToTensor(value, dtype, &t);
-    if (!cppstatus.ok()) {
-      PyErr_SetString(PyExc_ValueError, cppstatus.error_message().c_str());
-      return -1;
-    }
-    handle = tensorflow::make_safe(TFE_NewTensorHandle(t));
-  }
-  if (PyErr_Occurred()) return -1;
-  if (handle == nullptr) {
-    PyErr_SetString(PyExc_ValueError, "Error while creating an EagerTensor");
-    return -1;
-  }
+  tensorflow::Safe_TFE_TensorHandlePtr handle =
+      tensorflow::make_safe(static_cast<TFE_TensorHandle*>(
+          tensorflow::ConvertToEagerTensor(value, dtype)));
+  if (handle == nullptr) return -1;
   TF_DataType handle_dtype = TFE_TensorHandleDataType(handle.get());
   if (desired_dtype >= 0 && desired_dtype != handle_dtype) {
-    handle = tensorflow::make_safe(
-        EagerCast(GetContext(context), handle.get(), handle_dtype,
-                  static_cast<TF_DataType>(desired_dtype), self->status));
+    handle = tensorflow::make_safe(tensorflow::EagerCast(
+        GetContext(context), handle.get(), handle_dtype,
+        static_cast<TF_DataType>(desired_dtype), self->status));
     if (TF_GetCode(self->status) != TF_OK) {
       PyErr_SetString(PyExc_ValueError,
                       tensorflow::strings::StrCat(
@@ -701,12 +714,12 @@ PyObject* TFE_Py_InitEagerTensor(PyObject* base_class) {
   return reinterpret_cast<PyObject*>(EagerTensorType);
 }
 
-PyObject* TFE_Py_TensorShapeSlice(PyObject* tensor_list, int slice_dim) {
-  if (!PyList_Check(tensor_list)) {
+PyObject* TFE_Py_TensorShapeSlice(PyObject* tensors, int slice_dim) {
+  if (!PyList_Check(tensors) && !PyTuple_Check(tensors)) {
     PyErr_SetString(PyExc_TypeError,
                     tensorflow::strings::StrCat(
-                        "tensor_list argument must be a list. Got \"",
-                        Py_TYPE(tensor_list)->tp_name, "\"")
+                        "tensors argument must be a list or a tuple. Got \"",
Got \"", + Py_TYPE(tensors)->tp_name, "\"") .c_str()); return nullptr; } @@ -720,14 +733,14 @@ PyObject* TFE_Py_TensorShapeSlice(PyObject* tensor_list, int slice_dim) { return nullptr; } - Py_ssize_t num_tensors = PyList_Size(tensor_list); + Py_ssize_t num_tensors = PySequence_Fast_GET_SIZE(tensors); int64_t num_tensors_int = static_cast(num_tensors); auto tensor = tensorflow::make_safe(TF_AllocateTensor( TF_INT32, &num_tensors_int, /*num_dims=*/1, /*len=*/4 * num_tensors_int)); int32_t* data = reinterpret_cast(TF_TensorData(tensor.get())); auto status = tensorflow::make_safe(TF_NewStatus()); for (Py_ssize_t i = 0; i < num_tensors; ++i) { - PyObject* tensor_obj = PyList_GET_ITEM(tensor_list, i); + PyObject* tensor_obj = PySequence_Fast_GET_ITEM(tensors, i); if (!EagerTensor_CheckExact(tensor_obj)) { PyErr_SetString(PyExc_TypeError, tensorflow::strings::StrCat( diff --git a/tensorflow/python/eager/pywrap_tensor.h b/tensorflow/python/eager/pywrap_tensor.h index aa1efdd1b81..63ab1ed84d5 100644 --- a/tensorflow/python/eager/pywrap_tensor.h +++ b/tensorflow/python/eager/pywrap_tensor.h @@ -22,4 +22,14 @@ limitations under the License. bool EagerTensor_CheckExact(const PyObject* o); tensorflow::int64 EagerTensor_id(const PyObject* tensor); +namespace tensorflow { +TFE_TensorHandle* ConvertToEagerTensor(PyObject* value, PyObject* dtype); + +// TODO(nareshmodi): Move EagerCast and ReadVariableOp (which use the C API to +// execute TFE Ops) to a separate common library. +TFE_TensorHandle* EagerCast(TFE_Context* ctx, TFE_TensorHandle* handle, + TF_DataType src_type_enum, + TF_DataType dst_type_enum, TF_Status* out_status); +} + #endif // TENSORFLOW_PYTHON_EAGER_PYWRAP_TENSOR_H_ diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h index 32d731d0f68..691b613e48b 100644 --- a/tensorflow/python/eager/pywrap_tfe.h +++ b/tensorflow/python/eager/pywrap_tfe.h @@ -186,16 +186,16 @@ PyObject* TFE_Py_RecordGradient(PyObject* op_name, PyObject* inputs, // Returns the set of variables watched by the given tape. PyObject* TFE_Py_TapeWatchedVariables(PyObject* tape); -// Returns an EagerTensor of dimension [len(`tensor_list`)] containing -// the `slice_dim`'th dimension of each tensor in `tensor_list`. In other words, +// Returns an EagerTensor of dimension [len(`tensors`)] containing +// the `slice_dim`'th dimension of each tensor in `tensors`. In other words, // TFE_Py_TensorShapeSlice takes a slice of dimensions of tensors in -// `tensor_list`. For example, if `tensor_list` contains tensors of with shapes +// `tensors`. For example, if `tensors` contains tensors of with shapes // [1, 2, 3], [4, 5], [6, 7, 8, 9], TFE_Py_TensorShapeSlice called with // `slice_dim` equal to 1 will return [2, 5, 7]. // On error, returns nullptr and sets python exception. -// REQUIRES: `tensor_list` is a python list of EagerTensors +// REQUIRES: `tensors` is a python list/tuple of EagerTensors // REQUIRES: `slice_dim` is non-negative and smaller than the rank of all -// tensors in `tensor_list`. -PyObject* TFE_Py_TensorShapeSlice(PyObject* tensor_list, int slice_dim); +// tensors in `tensors`. 
+PyObject* TFE_Py_TensorShapeSlice(PyObject* tensors, int slice_dim);
 
 #endif  // TENSORFLOW_PYTHON_EAGER_PYWRAP_TFE_H_
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index d99bd0b0ffe..2bfa1f052cf 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -38,6 +38,54 @@ using tensorflow::strings::Printf;
 
 namespace {
 
+struct InputInfo {
+  InputInfo(int i, bool is_list) : i(i), is_list(is_list) {}
+
+  int i;
+  bool is_list = false;
+};
+
+using AttrToInputsMap =
+    tensorflow::gtl::FlatMap<string, std::vector<InputInfo>>;
+
+tensorflow::mutex all_attr_to_input_maps_lock(tensorflow::LINKER_INITIALIZED);
+tensorflow::gtl::FlatMap<string, AttrToInputsMap*>* GetAllAttrToInputsMaps() {
+  static auto* all_attr_to_input_maps =
+      new tensorflow::gtl::FlatMap<string, AttrToInputsMap*>;
+  return all_attr_to_input_maps;
+}
+
+AttrToInputsMap* GetAttrToInputsMap(const tensorflow::OpDef& op_def) {
+  tensorflow::mutex_lock l(all_attr_to_input_maps_lock);
+  auto* all_attr_to_input_maps = GetAllAttrToInputsMaps();
+
+  auto* output =
+      tensorflow::gtl::FindPtrOrNull(*all_attr_to_input_maps, op_def.name());
+  if (output != nullptr) {
+    return output;
+  }
+
+  std::unique_ptr<AttrToInputsMap> m(new AttrToInputsMap);
+
+  // Store a list of InputIndex -> List of corresponding inputs.
+  for (int i = 0; i < op_def.input_arg_size(); i++) {
+    if (!op_def.input_arg(i).type_attr().empty()) {
+      auto it = m->find(op_def.input_arg(i).type_attr());
+      if (it == m->end()) {
+        it = m->insert({op_def.input_arg(i).type_attr(), {}}).first;
+      }
+      it->second.emplace_back(i, !op_def.input_arg(i).number_attr().empty());
+    }
+  }
+
+  auto* retval = m.get();
+  (*all_attr_to_input_maps)[op_def.name()] = m.release();
+
+  return retval;
+}
+
 struct FastPathOpExecInfo {
   TFE_Context* ctx;
   const char* device_name;
@@ -53,6 +101,14 @@ struct FastPathOpExecInfo {
   // The op type name of the main op being executed.
   PyObject* op_name;
   PyObject* callbacks;
+
+  // All the args passed into the FastPathOpExecInfo.
+  PyObject* args;
+
+  // DTypes can come from another input that has the same attr. So build that
+  // map.
+  const AttrToInputsMap* attr_to_inputs_map;
+  tensorflow::gtl::FlatMap<string, tensorflow::DataType> cached_dtypes;
 };
 
 #define PARSE_VALUE(fn_name, type, check_fn, parse_fn)  \
@@ -76,12 +132,29 @@ PARSE_VALUE(ParseIntValue, int, PyLong_Check, PyLong_AsLong)
 PARSE_VALUE(ParseInt64Value, int64_t, PyLong_Check, PyLong_AsLong)
 #else
 PARSE_VALUE(ParseIntValue, int, PyInt_Check, PyInt_AsLong)
-PARSE_VALUE(ParseInt64Value, int64_t, PyInt_Check, PyInt_AsLong)
-PARSE_VALUE(ParseInt64LongValue, int64_t, PyLong_Check, PyLong_AsLong)
 #endif
 PARSE_VALUE(ParseFloatValue, float, PyFloat_Check, PyFloat_AsDouble)
 #undef PARSE_VALUE
 
+#if PY_MAJOR_VERSION < 3
+bool ParseInt64Value(const string& key, PyObject* py_value, TF_Status* status,
+                     int64_t* value) {
+  if (PyInt_Check(py_value)) {
+    *value = static_cast<int64_t>(PyInt_AsLong(py_value));
+    return true;
+  } else if (PyLong_Check(py_value)) {
+    *value = static_cast<int64_t>(PyLong_AsLong(py_value));
+    return true;
+  }
+  TF_SetStatus(
+      status, TF_INVALID_ARGUMENT,
+      tensorflow::strings::StrCat("Expecting int or long value for attr ", key,
+                                  ", got ", py_value->ob_type->tp_name)
+          .c_str());
+  return false;
+}
+#endif
+
 Py_ssize_t TensorShapeNumDims(PyObject* value) {
   const auto size = PySequence_Size(value);
   if (size == -1) {
@@ -234,7 +307,7 @@ bool SetOpAttrList(
       std::unique_ptr<int64_t[]> buffer(new int64_t[total_dims]);
       // Copy the input dims into the buffer and set dims to point to
       // the start of each list's dims.
-      std::unique_ptr<const int64_t* []> dims(new const int64_t*[num_values]);
+      std::unique_ptr<const int64_t*[]> dims(new const int64_t*[num_values]);
       std::unique_ptr<int[]> num_dims(new int[num_values]);
       int64_t* offset = buffer.get();
       for (int i = 0; i < num_values; ++i) {
@@ -296,7 +369,7 @@ void SetOpAttrListDefault(
     TF_Status* status) {
   if (type == TF_ATTR_STRING) {
     int num_values = attr.default_value().list().s_size();
-    std::unique_ptr<const char* []> values(new const char*[num_values]);
+    std::unique_ptr<const char*[]> values(new const char*[num_values]);
     (*attr_list_sizes)[key] = num_values;
     for (int i = 0; i < num_values; i++) {
       values[i] = attr.default_value().list().s(i).data();
@@ -349,7 +422,7 @@ void SetOpAttrListDefault(
       std::unique_ptr<int64_t[]> buffer(new int64_t[total_dims]);
       // Copy the input dims into the buffer and set dims to point to
       // the start of each list's dims.
-      std::unique_ptr<const int64_t* []> dims(new const int64_t*[num_values]);
+      std::unique_ptr<const int64_t*[]> dims(new const int64_t*[num_values]);
       std::unique_ptr<int[]> num_dims(new int[num_values]);
       int64_t* offset = buffer.get();
       for (int i = 0; i < num_values; ++i) {
@@ -369,7 +442,7 @@ void SetOpAttrListDefault(
   } else if (type == TF_ATTR_FUNC) {
     int num_values = attr.default_value().list().func_size();
     (*attr_list_sizes)[key] = num_values;
-    std::unique_ptr<const TFE_Op* []> funcs(new const TFE_Op*[num_values]);
+    std::unique_ptr<const TFE_Op*[]> funcs(new const TFE_Op*[num_values]);
     for (int i = 0; i < num_values; i++) {
       funcs[i] = GetFunc(ctx, attr.default_value().list().func(i), status);
     }
@@ -1399,10 +1472,39 @@ PyObject* GetPythonObjectFromString(const char* s) {
 #endif
 }
 
+PyObject* GetPythonObjectFromInt(int num) {
+#if PY_MAJOR_VERSION >= 3
+  return PyLong_FromLong(num);
+#else
+  return PyInt_FromLong(num);
+#endif
+}
+
 bool CheckResourceVariable(PyObject* item) {
   return PyObject_TypeCheck(item, resource_variable_type);
 }
 
+bool IsNumberType(PyObject* item) {
+#if PY_MAJOR_VERSION >= 3
+  return PyFloat_Check(item) || PyLong_Check(item);
+#else
+  return PyFloat_Check(item) || PyInt_Check(item) || PyLong_Check(item);
+#endif
+}
+
+bool CheckOneInput(PyObject* item) {
+  if (EagerTensor_CheckExact(item) || CheckResourceVariable(item) ||
+      PyArray_Check(item) || IsNumberType(item)) {
+    return true;
+  }
+
+  // Sequences are not properly handled. Sequences with purely python numeric
+  // types work, but sequences with mixes of EagerTensors and python numeric
+  // types don't work.
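+  // e.g. a mixed sequence like [eager_tensor, 2.0] cannot be converted
+  // uniformly, so sequences currently fall back to the slow path
+  // (illustrative).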
+  // TODO(nareshmodi): fix
+  return false;
+}
+
 bool CheckInputsOk(PyObject* seq, int start_index,
                    const tensorflow::OpDef& op_def) {
   for (int i = 0; i < op_def.input_arg_size(); i++) {
@@ -1419,8 +1521,7 @@ bool CheckInputsOk(PyObject* seq, int start_index,
       }
       for (Py_ssize_t j = 0; j < PySequence_Fast_GET_SIZE(item); j++) {
         PyObject* inner_item = PySequence_Fast_GET_ITEM(item, j);
-        if (!EagerTensor_CheckExact(inner_item) &&
-            !CheckResourceVariable(inner_item)) {
+        if (!CheckOneInput(inner_item)) {
          VLOG(1) << "Falling back to slow path for Op \"" << op_def.name()
                  << "\", Input \"" << op_def.input_arg(i).name()
                  << "\", Index "
          return false;
        }
      }
-    } else if (!EagerTensor_CheckExact(item) && !CheckResourceVariable(item)) {
+    } else if (!CheckOneInput(item)) {
      VLOG(1) << "Falling back to slow path for Op \"" << op_def.name()
              << "\", Input \"" << op_def.input_arg(i).name()
@@ -1443,6 +1544,52 @@ bool CheckInputsOk(PyObject* seq, int start_index,
   return true;
 }
 
+PyObject* MaybeGetDType(PyObject* item) {
+  if (EagerTensor_CheckExact(item)) {
+    tensorflow::Safe_PyObjectPtr py_dtype(
+        PyObject_GetAttrString(item, "dtype"));
+    return PyObject_GetAttrString(py_dtype.get(), "_type_enum");
+  }
+
+  if (CheckResourceVariable(item)) {
+    tensorflow::Safe_PyObjectPtr py_dtype(
+        PyObject_GetAttrString(item, "_dtype"));
+    return PyObject_GetAttrString(py_dtype.get(), "_type_enum");
+  }
+
+  return nullptr;
+}
+
+PyObject* MaybeGetDTypeForAttr(const string& attr,
+                               FastPathOpExecInfo* op_exec_info) {
+  auto cached_it = op_exec_info->cached_dtypes.find(attr);
+  if (cached_it != op_exec_info->cached_dtypes.end()) {
+    return GetPythonObjectFromInt(cached_it->second);
+  }
+
+  auto it = op_exec_info->attr_to_inputs_map->find(attr);
+  if (it == op_exec_info->attr_to_inputs_map->end()) {
+    // No other inputs - this should never happen.
+    Py_RETURN_NONE;
+  }
+
+  for (const auto& input_info : it->second) {
+    PyObject* item = PyTuple_GET_ITEM(
+        op_exec_info->args, kFastPathExecuteInputStartIndex + input_info.i);
+    if (input_info.is_list) {
+      for (int i = 0; i < PySequence_Fast_GET_SIZE(item); i++) {
+        auto* dtype = MaybeGetDType(PySequence_Fast_GET_ITEM(item, i));
+        if (dtype != nullptr) return dtype;
+      }
+    } else {
+      auto* dtype = MaybeGetDType(item);
+      if (dtype != nullptr) return dtype;
+    }
+  }
+
+  Py_RETURN_NONE;
+}
+
 bool OpDoesntRequireOutput(const string& op_name) {
   static tensorflow::gtl::FlatSet<string>* ops_that_dont_require_outputs =
       new tensorflow::gtl::FlatSet<string>({
@@ -1668,23 +1815,80 @@ bool ReadVariableOp(const FastPathOpExecInfo& parent_op_exec_info,
 // i) input is an EagerTensor
 // ii) input is a ResourceVariable - in this case, the is_variable param is set
 // to true.
-bool ConvertToTensor(const FastPathOpExecInfo& op_exec_info, PyObject* input,
-                     tensorflow::Safe_PyObjectPtr* output_handle,
-                     TF_Status* status) {
-  if (CheckResourceVariable(input)) {
+//
+// NOTE: dtype_hint_getter must *always* return a PyObject that can be
+// decref'd. So if no hint is found, Py_RETURN_NONE (which correctly
+// increfs Py_None).
+bool ConvertToTensor(
+    const FastPathOpExecInfo& op_exec_info, PyObject* input,
+    tensorflow::Safe_PyObjectPtr* output_handle,
+    // This gets a hint for this particular input.
+    const std::function<PyObject*()>& dtype_hint_getter,
+    // This sets the dtype after conversion is complete.
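+    // (e.g. caching the inferred dtype so that later inputs sharing the same
+    // type attr can reuse it as their hint.)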
+    const std::function<void(const TF_DataType&)>& dtype_setter,
+    TF_Status* status) {
+  if (EagerTensor_CheckExact(input)) {
+    Py_INCREF(input);
+    output_handle->reset(input);
+    return true;
+  } else if (CheckResourceVariable(input)) {
     return ReadVariableOp(op_exec_info, input, output_handle, status);
   }
 
-  Py_INCREF(input);
-  output_handle->reset(input);
+  // The hint comes from a supposedly similarly typed tensor.
+  tensorflow::Safe_PyObjectPtr dtype_hint(dtype_hint_getter());
+  if (PyErr_Occurred()) {
+    return false;
+  }
+
+  tensorflow::Safe_TFE_TensorHandlePtr handle =
+      tensorflow::make_safe(static_cast<TFE_TensorHandle*>(
+          tensorflow::ConvertToEagerTensor(input, dtype_hint.get())));
+  if (handle == nullptr) {
+    status->status = tensorflow::errors::InvalidArgument(
+        "Unable to convert value to tensor");
+    return false;
+  }
+
+  int desired_dtype = -1;
+  if (dtype_hint.get() != Py_None) {
+    if (!ParseTypeValue("", dtype_hint.get(), status, &desired_dtype)) {
+      status->status = tensorflow::errors::InvalidArgument(
+          "Expecting a DataType value for dtype. Got ",
+          Py_TYPE(dtype_hint.get())->tp_name);
+    }
+  }
+
+  TF_DataType handle_dtype = TFE_TensorHandleDataType(handle.get());
+  if (desired_dtype >= 0 && desired_dtype != handle_dtype) {
+    handle = tensorflow::make_safe(
+        tensorflow::EagerCast(op_exec_info.ctx, handle.get(), handle_dtype,
+                              static_cast<TF_DataType>(desired_dtype), status));
+    if (!status->status.ok()) return false;
+
+    handle_dtype = TFE_TensorHandleDataType(handle.get());
+  }
+
+  if (handle_dtype != TF_INT32) {
+    // Note that this is a shallow copy and will share the underlying buffer
+    // if copying to the same device.
+    handle = tensorflow::make_safe(TFE_TensorHandleCopyToDevice(
+        handle.get(), op_exec_info.ctx, op_exec_info.device_name, status));
+    if (!status->status.ok()) return false;
+  }
+
+  output_handle->reset(EagerTensorFromHandle(handle.release()));
+
+  dtype_setter(handle_dtype);
 
   return true;
 }
 
 // Adds input and type attr to the op, and to the list of flattened
 // inputs/attrs.
-bool AddInputToOp(const FastPathOpExecInfo& op_exec_info, PyObject* input,
-                  const tensorflow::OpDef::ArgDef* input_arg,
+bool AddInputToOp(FastPathOpExecInfo* op_exec_info, PyObject* input,
+                  const bool add_type_attr,
+                  const tensorflow::OpDef::ArgDef& input_arg,
                   std::vector<PyObject*>* flattened_attrs,
                   std::vector<PyObject*>* flattened_inputs, TFE_Op* op,
                   TF_Status* status) {
@@ -1693,18 +1897,30 @@ bool AddInputToOp(FastPathOpExecInfo* op_exec_info, PyObject* input,
   // out of scope in this function.
   tensorflow::Safe_PyObjectPtr py_eager_tensor = nullptr;
-  if (!ConvertToTensor(op_exec_info, input, &py_eager_tensor, status)) {
+  if (!ConvertToTensor(
+          *op_exec_info, input, &py_eager_tensor,
+          [&]() {
+            if (input_arg.type() != tensorflow::DataType::DT_INVALID) {
+              return GetPythonObjectFromInt(input_arg.type());
+            }
+            return MaybeGetDTypeForAttr(input_arg.type_attr(), op_exec_info);
+          },
+          [&](const TF_DataType dtype) {
+            op_exec_info->cached_dtypes[input_arg.type_attr()] =
+                static_cast<tensorflow::DataType>(dtype);
+          },
+          status)) {
     return false;
   }
 
   TFE_TensorHandle* input_handle = EagerTensor_Handle(py_eager_tensor.get());
 
-  if (input_arg != nullptr && !input_arg->type_attr().empty()) {
+  if (add_type_attr && !input_arg.type_attr().empty()) {
     auto dtype = TFE_TensorHandleDataType(input_handle);
-    TFE_OpSetAttrType(op, input_arg->type_attr().data(), dtype);
+    TFE_OpSetAttrType(op, input_arg.type_attr().data(), dtype);
     if (flattened_attrs != nullptr) {
       flattened_attrs->emplace_back(
-          GetPythonObjectFromString(input_arg->type_attr().data()));
+          GetPythonObjectFromString(input_arg.type_attr().data()));
       flattened_attrs->emplace_back(PyLong_FromLong(dtype));
     }
   }
@@ -1844,6 +2060,7 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
 
   op_exec_info.ctx = reinterpret_cast<TFE_Context*>(
       PyCapsule_GetPointer(PyTuple_GET_ITEM(args, 0), nullptr));
+  op_exec_info.args = args;
 
   if (op_exec_info.ctx == nullptr) {
     // The context hasn't been initialized. It will be in the slow path.
@@ -1892,6 +2109,8 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
     return nullptr;
   }
 
+  op_exec_info.attr_to_inputs_map = GetAttrToInputsMap(*op_def);
+
   TF_Status* status = TF_NewStatus();
   TFE_Op* op = TFE_NewOp(op_exec_info.ctx, op_def->name().c_str(), status);
   auto cleaner = tensorflow::gtl::MakeCleanup([status, op] {
@@ -1986,17 +2205,16 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
 
       if (len > 0) {
         // First item adds the type attr.
-        if (!AddInputToOp(op_exec_info, PySequence_Fast_GET_ITEM(input, 0),
-                          &input_arg, flattened_attrs.get(),
+        if (!AddInputToOp(&op_exec_info, PySequence_Fast_GET_ITEM(input, 0),
+                          true, input_arg, flattened_attrs.get(),
                           flattened_inputs.get(), op, status)) {
           return nullptr;
         }
 
         for (Py_ssize_t j = 1; j < len; j++) {
           // Since the list is homogeneous, we don't need to re-add the attr.
-          if (!AddInputToOp(op_exec_info, PySequence_Fast_GET_ITEM(input, j),
-                            nullptr /* input_arg */,
-                            nullptr /* flattened_attrs */,
+          if (!AddInputToOp(&op_exec_info, PySequence_Fast_GET_ITEM(input, j),
+                            false, input_arg, nullptr /* flattened_attrs */,
                             flattened_inputs.get(), op, status)) {
             return nullptr;
           }
@@ -2018,7 +2236,8 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
         PyObject* py_input = PySequence_Fast_GET_ITEM(input, j);
         tensorflow::Safe_PyObjectPtr py_eager_tensor;
         if (!ConvertToTensor(op_exec_info, py_input, &py_eager_tensor,
-                             status)) {
+                             []() { Py_RETURN_NONE; },
+                             [](const TF_DataType& dtype) {}, status)) {
           return nullptr;
         }
 
@@ -2048,8 +2267,9 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
       attr_list_sizes[attr_name] = len;
     } else {
       // The item is a single item.
- if (!AddInputToOp(op_exec_info, input, &input_arg, flattened_attrs.get(), - flattened_inputs.get(), op, status)) { + if (!AddInputToOp(&op_exec_info, input, true, input_arg, + flattened_attrs.get(), flattened_inputs.get(), op, + status)) { return nullptr; } } diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py index 0bd5a5dbafd..b044b302316 100644 --- a/tensorflow/python/eager/tensor_test.py +++ b/tensorflow/python/eager/tensor_test.py @@ -278,14 +278,9 @@ class TFETensorUtilTest(test_util.TensorFlowTestCase): with self.assertRaisesRegexp( TypeError, - r"tensor_list argument must be a list. Got \"EagerTensor\""): + r"tensors argument must be a list or a tuple. Got \"EagerTensor\""): pywrap_tensorflow.TFE_Py_TensorShapeSlice(t1, -2) - with self.assertRaisesRegexp( - TypeError, - r"tensor_list argument must be a list. Got \"tuple\""): - pywrap_tensorflow.TFE_Py_TensorShapeSlice((t1,), -2) - def testNegativeSliceDim(self): t1 = _create_tensor([1, 2], dtype=dtypes.int32) diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 662cda2a7d4..8cd6820f6a5 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -1385,6 +1385,22 @@ def register_tensor_conversion_function(base_type, if not callable(conversion_func): raise TypeError("conversion_func must be callable.") + # context._context is checked so that we don't inadvertently create it. + # This is because enable_eager_execution will fail when called from the main + # function if the context._context is already created, and the + # register_tensor_conversion_function calls happen when the module is + # imported. + if context._context is not None and context.executing_eagerly( + ) and isinstance(base_type, six.integer_types + ( + float, + np.ndarray, + )): + # TODO(nareshmodi): consider setting a context variable which disables the + # fastpath instead. 
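+  # (The eager fastpath converts numbers and ndarrays directly, so a
+  # registered conversion function for these types would be silently
+  # bypassed.)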
+ raise TypeError( + "Cannot register conversions for numpy arrays, python number types " + "when executing eagerly.") + try: funcs_at_priority = _tensor_conversion_func_registry[priority] except KeyError: From 76ea66f24d4370e6e7848b83fc0b571ba7edfa2d Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Fri, 20 Apr 2018 11:34:55 -0700 Subject: [PATCH 0529/1734] Move the guts of TFE_Op into EagerOperation PiperOrigin-RevId: 193698320 --- tensorflow/c/eager/BUILD | 2 + tensorflow/c/eager/c_api.cc | 230 +++++++++--------- tensorflow/c/eager/c_api_internal.h | 16 +- tensorflow/core/common_runtime/eager/BUILD | 16 ++ .../common_runtime/eager/eager_operation.cc | 33 +++ .../common_runtime/eager/eager_operation.h | 74 ++++++ 6 files changed, 242 insertions(+), 129 deletions(-) create mode 100644 tensorflow/core/common_runtime/eager/eager_operation.cc create mode 100644 tensorflow/core/common_runtime/eager/eager_operation.h diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index 3e14c107272..d66386acbd6 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -51,6 +51,7 @@ tf_cuda_library( ], "//conditions:default": [], }) + [ + "//tensorflow/core/common_runtime/eager:eager_operation", "//tensorflow/core:gpu_runtime", ], ) @@ -73,6 +74,7 @@ tf_cuda_library( "//tensorflow/core:lib_internal", "//tensorflow/core/common_runtime/eager:context", "//tensorflow/core/common_runtime/eager:eager_executor", + "//tensorflow/core/common_runtime/eager:eager_operation", "//tensorflow/core/common_runtime/eager:kernel_and_device", "//tensorflow/core/common_runtime/eager:tensor_handle", ], diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 369342b1425..b7a30972083 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -241,21 +241,18 @@ TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name, void TFE_DeleteOp(TFE_Op* op) { delete op; } void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status) { - tensorflow::Device* d = nullptr; - if (device_name != nullptr && strlen(device_name) > 0) { - status->status = op->ctx->context.FindDeviceByName(device_name, &d); - } - op->device = d; + status->status = op->operation.SetDevice(device_name); } const char* TFE_OpGetDevice(TFE_Op* op, TF_Status* status) { - tensorflow::Device* device = - (op->device == nullptr) ? op->ctx->context.HostCPU() : op->device; + tensorflow::Device* device = (op->operation.Device() == nullptr) + ? op->operation.EagerContext()->HostCPU() + : op->operation.Device(); return device->name().c_str(); } void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) { - op->use_xla = enable; + op->operation.SetUseXla(enable); #ifndef TENSORFLOW_EAGER_USE_XLA LOG(WARNING) << "This call is a no-op, as the TensorFlow library is not " "built with XLA support."; @@ -263,22 +260,20 @@ void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) { } void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) { - h->handle->Ref(); - op->inputs.push_back(h->handle); - op->attrs.NumInputs(op->inputs.size()); + op->operation.AddInput(h->handle); } TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name, unsigned char* is_list, TF_Status* status) { TF_AttrType ret; - if (op->is_function()) { + if (op->operation.is_function()) { status->status = tensorflow::errors::Unimplemented( "TODO(apassos): Support for attributes for TensorFlow functions is not " "ready yet."); return TF_ATTR_INT; // The compiler requires that we return something. 
} - status->status = - tensorflow::AttrTypeByName(*op->attr_types, attr_name, &ret, is_list); + status->status = tensorflow::AttrTypeByName(*op->operation.AttrTypes(), + attr_name, &ret, is_list); return ret; } @@ -297,23 +292,24 @@ TF_AttrType TFE_OpNameGetAttrType(TFE_Context* ctx, } void TFE_OpSetAttrString(TFE_Op* op, const char* attr_name, const char* value) { - op->attrs.Set(attr_name, value); + op->operation.MutableAttrs()->Set(attr_name, value); } void TFE_OpSetAttrInt(TFE_Op* op, const char* attr_name, int64_t value) { - op->attrs.Set(attr_name, static_cast(value)); + op->operation.MutableAttrs()->Set(attr_name, static_cast(value)); } void TFE_OpSetAttrFloat(TFE_Op* op, const char* attr_name, float value) { - op->attrs.Set(attr_name, value); + op->operation.MutableAttrs()->Set(attr_name, value); } void TFE_OpSetAttrBool(TFE_Op* op, const char* attr_name, unsigned char value) { - op->attrs.Set(attr_name, (value == 0) ? false : true); + op->operation.MutableAttrs()->Set(attr_name, (value == 0) ? false : true); } void TFE_OpSetAttrType(TFE_Op* op, const char* attr_name, TF_DataType value) { - op->attrs.Set(attr_name, static_cast(value)); + op->operation.MutableAttrs()->Set(attr_name, + static_cast(value)); } void TFE_OpSetAttrShape(TFE_Op* op, const char* attr_name, const int64_t* dims, @@ -335,23 +331,24 @@ void TFE_OpSetAttrShape(TFE_Op* op, const char* attr_name, const int64_t* dims, proto.add_dim()->set_size(dims[d]); } } - op->attrs.Set(attr_name, proto); + op->operation.MutableAttrs()->Set(attr_name, proto); } void TFE_OpSetAttrFunction(TFE_Op* op, const char* attr_name, const TFE_Op* value) { tensorflow::AttrValue attr_value; tensorflow::NameAttrList* func = attr_value.mutable_func(); - func->set_name(value->name); - value->attrs.FillAttrValueMap(func->mutable_attr()); - op->attrs.Set(attr_name, attr_value); + func->set_name(value->operation.Name()); + value->operation.Attrs().FillAttrValueMap(func->mutable_attr()); + op->operation.MutableAttrs()->Set(attr_name, attr_value); } #define TFE_OP_SET_ATTR_LIST(fn, type) \ void fn(TFE_Op* op, const char* attr_name, const type* values, \ int num_values) { \ - op->attrs.Set(attr_name, tensorflow::gtl::ArraySlice( \ - values, num_values)); \ + op->operation.MutableAttrs()->Set( \ + attr_name, \ + tensorflow::gtl::ArraySlice(values, num_values)); \ } TFE_OP_SET_ATTR_LIST(TFE_OpSetAttrStringList, char*) TFE_OP_SET_ATTR_LIST(TFE_OpSetAttrFloatList, float) @@ -359,14 +356,14 @@ TFE_OP_SET_ATTR_LIST(TFE_OpSetAttrFloatList, float) void TFE_OpSetAttrIntList(TFE_Op* op, const char* attr_name, const int64_t* values, int num_values) { - op->attrs.Set(attr_name, - tensorflow::gtl::ArraySlice( - reinterpret_cast(values), num_values)); + op->operation.MutableAttrs()->Set( + attr_name, tensorflow::gtl::ArraySlice( + reinterpret_cast(values), num_values)); } void TFE_OpSetAttrTypeList(TFE_Op* op, const char* attr_name, const TF_DataType* values, int num_values) { - op->attrs.Set( + op->operation.MutableAttrs()->Set( attr_name, tensorflow::gtl::ArraySlice( reinterpret_cast(values), num_values)); @@ -378,8 +375,8 @@ void TFE_OpSetAttrBoolList(TFE_Op* op, const char* attr_name, for (int i = 0; i < num_values; ++i) { b[i] = values[i]; } - op->attrs.Set(attr_name, - tensorflow::gtl::ArraySlice(b.get(), num_values)); + op->operation.MutableAttrs()->Set( + attr_name, tensorflow::gtl::ArraySlice(b.get(), num_values)); } void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name, @@ -409,9 +406,9 @@ void TFE_OpSetAttrShapeList(TFE_Op* op, const char* 
attr_name, } } } - op->attrs.Set(attr_name, - tensorflow::gtl::ArraySlice( - proto.get(), num_values)); + op->operation.MutableAttrs()->Set( + attr_name, tensorflow::gtl::ArraySlice( + proto.get(), num_values)); } void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name, @@ -419,12 +416,12 @@ void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name, std::unique_ptr funcs( new tensorflow::NameAttrList[num_values]); for (int i = 0; i < num_values; i++) { - funcs[i].set_name(value[i]->name); - value[i]->attrs.FillAttrValueMap(funcs[i].mutable_attr()); + funcs[i].set_name(value[i]->operation.Name()); + value[i]->operation.Attrs().FillAttrValueMap(funcs[i].mutable_attr()); } - op->attrs.Set(attr_name, - tensorflow::gtl::ArraySlice( - funcs.get(), num_values)); + op->operation.MutableAttrs()->Set( + attr_name, tensorflow::gtl::ArraySlice( + funcs.get(), num_values)); } } // extern "C" @@ -460,18 +457,19 @@ int StepStatsDeviceIndex(tensorflow::StepStats* step_stats, } tensorflow::Status ValidateInputTypeAndPlacement( - tensorflow::EagerContext* ctx, tensorflow::Device* op_device, TFE_Op* op, - const tensorflow::OpKernel* kernel, tensorflow::RunMetadata* run_metadata) { + tensorflow::EagerContext* ctx, tensorflow::Device* op_device, + tensorflow::EagerOperation* op, const tensorflow::OpKernel* kernel, + tensorflow::RunMetadata* run_metadata) { tensorflow::Device* host_device = ctx->HostCPU(); const tensorflow::MemoryTypeVector& memtypes = kernel->input_memory_types(); - if (memtypes.size() != op->inputs.size()) { + if (memtypes.size() != op->Inputs().size()) { return tensorflow::errors::InvalidArgument( - "expected ", memtypes.size(), " inputs, got ", op->inputs.size()); + "expected ", memtypes.size(), " inputs, got ", op->Inputs().size()); } - for (int i = 0; i < op->inputs.size(); ++i) { + for (int i = 0; i < op->Inputs().size(); ++i) { const tensorflow::Device* expected_device = memtypes[i] == tensorflow::HOST_MEMORY ? 
host_device : op_device; - tensorflow::TensorHandle* handle = op->inputs[i]; + tensorflow::TensorHandle* handle = op->Inputs()[i]; tensorflow::Device* handle_device = nullptr; TF_RETURN_IF_ERROR(handle->Device(&handle_device)); const tensorflow::Device* actual_device = @@ -491,7 +489,7 @@ tensorflow::Status ValidateInputTypeAndPlacement( return tensorflow::errors::InvalidArgument( "Tensors on conflicting devices:" " cannot compute ", - op->name, " as input #", i, " was expected to be on ", + op->Name(), " as input #", i, " was expected to be on ", expected_device->name(), " but is actually on ", actual_device->name(), " (operation running on ", op_device->name(), ")", @@ -502,7 +500,7 @@ tensorflow::Status ValidateInputTypeAndPlacement( "between devices" " may slow down your model"); case tensorflow::DEVICE_PLACEMENT_WARN: - LOG(WARNING) << "before computing " << op->name << " input #" << i + LOG(WARNING) << "before computing " << op->Name() << " input #" << i << " was expected to be on " << expected_device->name() << " but is actually on " << actual_device->name() << " (operation running on " << op_device->name() @@ -534,16 +532,16 @@ tensorflow::Status ValidateInputTypeAndPlacement( if (copied_tensor != nullptr) copied_tensor->Unref(); return tensorflow::errors::Internal( "Failed copying input tensor from ", actual_device->name(), " to ", - expected_device->name(), " in order to run ", op->name, ": ", + expected_device->name(), " in order to run ", op->Name(), ": ", status.error_message()); } handle->Unref(); handle = copied_tensor; - op->inputs[i] = copied_tensor; + (*op->MutableInputs())[i] = copied_tensor; } if (handle->dtype != kernel->input_type(i)) { return tensorflow::errors::InvalidArgument( - "cannot compute ", op->name, " as input #", i, + "cannot compute ", op->Name(), " as input #", i, " was expected to be a ", tensorflow::DataTypeString(kernel->input_type(i)), " tensor but is a ", tensorflow::DataTypeString(handle->dtype), @@ -554,9 +552,10 @@ tensorflow::Status ValidateInputTypeAndPlacement( } tensorflow::Device* SelectDevice(const tensorflow::NodeDef& ndef, - TFE_Context* ctx, TF_Status* status) { + tensorflow::EagerContext* ctx, + TF_Status* status) { tensorflow::DeviceSet ds; - for (tensorflow::Device* d : *ctx->context.devices()) { + for (tensorflow::Device* d : *ctx->devices()) { ds.AddDevice(d); } tensorflow::DeviceTypeVector final_devices; @@ -570,7 +569,7 @@ tensorflow::Device* SelectDevice(const tensorflow::NodeDef& ndef, "Could not find valid device for node ", ndef.DebugString()); return nullptr; } - for (tensorflow::Device* d : *ctx->context.devices()) { + for (tensorflow::Device* d : *ctx->devices()) { if (d->device_type() == final_devices[0].type_string()) { return d; } @@ -599,15 +598,16 @@ const tensorflow::FunctionDef* OpToFunction( std::vector* arg_input_types, tensorflow::gtl::FlatMap* op_input_to_func_input, TF_Status* status) { - DCHECK(!op->is_function()); + DCHECK(!op->operation.is_function()); tensorflow::FunctionDef fdef; // Get the OpDef of the op we are trying to encapsulate. - TFE_Context* ctx = op->ctx; + TFE_Context* ctx = op->operation.ctx; const tensorflow::OpRegistrationData* op_data; { - status->status = ctx->context.FindFunctionOpData(op->name, &op_data); + status->status = + ctx->context.FindFunctionOpData(op->operation.Name(), &op_data); if (!status->status.ok()) { return nullptr; } @@ -618,7 +618,8 @@ const tensorflow::FunctionDef* OpToFunction( // Handle constant inputs. 
const std::unordered_set const_inputs( - *tensorflow::XlaOpRegistry::CompileTimeConstantInputs(op->name)); + *tensorflow::XlaOpRegistry::CompileTimeConstantInputs( + op->operation.Name())); // First add place holders for the input args, so that we can refer to them by // position in the next loop. Also tally up the resource inputs. @@ -644,7 +645,7 @@ const tensorflow::FunctionDef* OpToFunction( (*op_input_to_func_input)[i] = const_index; func_input_arg = signature->mutable_input_arg(const_index++); const_input_types->push_back( - static_cast(op->inputs[i]->dtype)); + static_cast(op->operation.Inputs()[i]->dtype)); } else if (op_input_arg.type() == tensorflow::DT_RESOURCE) { VLOG(1) << "For resource input, mapping op input " << i << " to func input " << resource_index; @@ -656,11 +657,11 @@ const tensorflow::FunctionDef* OpToFunction( (*op_input_to_func_input)[i] = arg_index; func_input_arg = signature->mutable_input_arg(arg_index++); arg_input_types->push_back( - static_cast(op->inputs[i]->dtype)); + static_cast(op->operation.Inputs()[i]->dtype)); } func_input_arg->set_name(op_input_arg.name()); - func_input_arg->set_type(op->inputs[i]->dtype); + func_input_arg->set_type(op->operation.Inputs()[i]->dtype); } VLOG(1) << "Added OpDef Inputs: " << fdef.DebugString(); @@ -673,7 +674,8 @@ const tensorflow::FunctionDef* OpToFunction( op_def.name(), func_id_generator.fetch_add(1))); // Add the node def and set its input names to match op_def's names. - const tensorflow::NodeDef& ndef = op->attrs.BuildNodeDef(); + const tensorflow::NodeDef& ndef = + op->operation.MutableAttrs()->BuildNodeDef(); DCHECK_EQ(signature->input_arg_size(), ndef.input_size()); *fdef.add_node_def() = ndef; for (int i = 0; i < op_def.input_arg_size(); ++i) { @@ -713,17 +715,18 @@ const tensorflow::FunctionDef* OpToFunction( // Builds an _XLALaunchOp as a wrapper over 'op', so that 'op' can be executed // via XLA. std::unique_ptr BuildXlaLaunch(TFE_Op* op, TF_Status* status) { - VLOG(1) << "Creating _XlaLaunchOp for TFE_Op " << op->name; - auto launch_op = - std::unique_ptr(TFE_NewOp(op->ctx, "_XlaLaunch", status)); + VLOG(1) << "Creating _XlaLaunchOp for TFE_Op " << op->operation.Name(); + auto launch_op = std::unique_ptr( + TFE_NewOp(op->operation.ctx, "_XlaLaunch", status)); if (TF_GetCode(status) != TF_OK) return nullptr; - if (op->device) { - TFE_OpSetDevice(launch_op.get(), op->device->name().c_str(), status); + if (op->operation.device) { + TFE_OpSetDevice(launch_op.get(), op->operation.device->name().c_str(), + status); if (TF_GetCode(status) != TF_OK) return nullptr; } const tensorflow::FunctionDef* fdef; - { fdef = op->ctx->context.FindFunctionDef(op->name); } + { fdef = op->operation.ctx->FindFunctionDef(op->operation.Name()); } std::vector const_input_types; std::vector arg_input_types; tensorflow::gtl::FlatMap op_input_to_func_input; @@ -748,20 +751,21 @@ std::unique_ptr BuildXlaLaunch(TFE_Op* op, TF_Status* status) { // Copy inputs and their devices. // Since input param reordering may have occurred between `op` and `launch_op` // via `op_input_to_func_input`, adjust the actual inputs accordingly. 
- launch_op->inputs = op->inputs; - for (tensorflow::TensorHandle* h : launch_op->inputs) { + *launch_op->operation.MutableInputs() = op->operation.Inputs(); + for (tensorflow::TensorHandle* h : launch_op->operation.Inputs()) { h->Ref(); } if (!op_input_to_func_input.empty()) { - DCHECK_EQ(op->inputs.size(), op_input_to_func_input.size()); + DCHECK_EQ(op->operation.Inputs().size(), op_input_to_func_input.size()); for (int i = 0; i < op_input_to_func_input.size(); ++i) { VLOG(1) << "mapping op input " << i << " to func input " << op_input_to_func_input[i]; - launch_op->inputs[op_input_to_func_input[i]] = op->inputs[i]; + (*launch_op->operation.MuableInputs())[op_input_to_func_input[i]] = + op->operation.Inputs()[i]; } } - launch_op->attrs.NumInputs(op->inputs.size()); + launch_op->operation.MutableAttrs()->NumInputs(op->operation.Inputs().size()); TFE_OpSetAttrTypeList(launch_op.get(), "Tconstants", const_input_types.data(), const_input_types.size()); @@ -796,16 +800,17 @@ std::unique_ptr BuildXlaLaunch(TFE_Op* op, TF_Status* status) { extern "C" { -void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals, +void TFE_Execute(TFE_Op* tfe_op, TFE_TensorHandle** retvals, int* num_retvals, TF_Status* status) { - TFE_Context* ctx = op->ctx; - status->status = ctx->context.GetStatus(); + tensorflow::EagerOperation* op = &tfe_op->operation; + tensorflow::EagerContext* ctx = op->EagerContext(); + status->status = ctx->GetStatus(); if (!status->status.ok()) { return; } #ifdef TENSORFLOW_EAGER_USE_XLA std::unique_ptr xla_launch_op; - if (op->use_xla && op->name != "_XlaLaunch") { + if (op->UseXla() && op->Name() != "_XlaLaunch") { xla_launch_op = BuildXlaLaunch(op, status); if (!status->status.ok()) { return; @@ -816,31 +821,31 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals, // Ensure all resource-touching ops run in the device the resource is, // regardless of anything else that has been specified. This is identical to // the graph mode behavior. - for (int i = 0; i < op->inputs.size(); ++i) { + for (int i = 0; i < op->Inputs().size(); ++i) { tensorflow::Device* input_op_device = nullptr; - status->status = op->inputs[i]->OpDevice(&input_op_device); + status->status = op->Inputs()[i]->OpDevice(&input_op_device); if (!status->status.ok()) return; - VLOG(2) << "for op " << op->name << " input " << i << " " - << tensorflow::DataTypeString(op->inputs[i]->dtype) << " " + VLOG(2) << "for op " << op->Name() << " input " << i << " " + << tensorflow::DataTypeString(op->Inputs()[i]->dtype) << " " << (input_op_device == nullptr ? "cpu" : input_op_device->name()) - << " " << (op->device == nullptr ? "cpu" : op->device->name()); - if (op->inputs[i]->dtype == tensorflow::DT_RESOURCE && - (input_op_device != op->device || input_op_device == nullptr)) { + << " " << (op->Device() == nullptr ? "cpu" : op->Device()->name()); + if (op->Inputs()[i]->dtype == tensorflow::DT_RESOURCE && + (input_op_device != op->Device() || input_op_device == nullptr)) { tensorflow::Device* d = - input_op_device == nullptr ? ctx->context.HostCPU() : input_op_device; - VLOG(1) << "Changing device of operation " << op->name << " to " + input_op_device == nullptr ? 
ctx->HostCPU() : input_op_device; + VLOG(1) << "Changing device of operation " << op->Name() << " to " << d->name() << " because input #" << i << " is a resource in this device."; - op->device = d; + op->SetDevice(d); } } - tensorflow::Device* device = op->device; + tensorflow::Device* device = op->Device(); - tensorflow::Fprint128 cache_key = - op->attrs.CacheKey(device == nullptr ? "unspecified" : device->name()); - tensorflow::KernelAndDevice* kernel = ctx->context.GetCachedKernel(cache_key); + tensorflow::Fprint128 cache_key = op->MutableAttrs()->CacheKey( + device == nullptr ? "unspecified" : device->name()); + tensorflow::KernelAndDevice* kernel = ctx->GetCachedKernel(cache_key); if (kernel == nullptr) { - const tensorflow::NodeDef& ndef = op->attrs.BuildNodeDef(); + const tensorflow::NodeDef& ndef = op->MutableAttrs()->BuildNodeDef(); if (device == nullptr) { device = SelectDevice(ndef, ctx, status); if (!status->status.ok()) { @@ -848,19 +853,19 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals, } } CHECK(device != nullptr); - if (ctx->context.LogDevicePlacement()) { + if (ctx->LogDevicePlacement()) { LOG(INFO) << "Executing op " << ndef.op() << " in device " << device->name(); } - kernel = new tensorflow::KernelAndDevice(ctx->context.GetRendezvous()); + kernel = new tensorflow::KernelAndDevice(ctx->GetRendezvous()); // Knowledge of the implementation of Init (and in-turn // FunctionLibraryRuntime::CreateKernel) tells us that ctx->func_lib_def // will be accessed, so grab on to the lock. // See WARNING comment in Execute (before kernel->Run) - would be nice to // rework to avoid this subtlety. - tensorflow::tf_shared_lock l(*ctx->context.FunctionsMu()); - status->status = tensorflow::KernelAndDevice::Init( - ndef, ctx->context.func_lib(device), kernel); + tensorflow::tf_shared_lock l(*ctx->FunctionsMu()); + status->status = + tensorflow::KernelAndDevice::Init(ndef, ctx->func_lib(device), kernel); if (!status->status.ok()) { delete kernel; return; @@ -868,7 +873,7 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals, // Update output_dtypes inside `kernel`. const tensorflow::OpDef* op_def = nullptr; const tensorflow::FunctionDef* function_def = - ctx->context.FuncLibDef()->Find(ndef.op()); + ctx->FuncLibDef()->Find(ndef.op()); if (function_def != nullptr) { op_def = &(function_def->signature()); } @@ -884,7 +889,7 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals, if (!status->status.ok()) { return; } - ctx->context.AddKernelToCache(cache_key, kernel); + ctx->AddKernelToCache(cache_key, kernel); } const tensorflow::DataTypeVector& output_dtypes = kernel->output_dtypes(); const int output_dtypes_size = output_dtypes.size(); @@ -903,43 +908,42 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals, device = kernel->device(); } status->status = ValidateInputTypeAndPlacement( - &ctx->context, device, op, kernel->kernel(), - ctx->context.ShouldStoreMetadata() ? ctx->context.RunMetadataProto() - : nullptr); + ctx, device, op, kernel->kernel(), + ctx->ShouldStoreMetadata() ? 
ctx->RunMetadataProto() : nullptr);
   if (!status->status.ok()) return;

   std::unique_ptr<tensorflow::NodeExecStats> maybe_stats;
-  if (ctx->context.ShouldStoreMetadata()) {
+  if (ctx->ShouldStoreMetadata()) {
     maybe_stats.reset(new tensorflow::NodeExecStats);
-    maybe_stats->set_node_name(op->name);
+    maybe_stats->set_node_name(op->Name());
     maybe_stats->set_all_start_micros(tensorflow::Env::Default()->NowMicros());
     maybe_stats->set_op_start_rel_micros(0);
     maybe_stats->set_scheduled_micros(tensorflow::Env::Default()->NowMicros());
     // TODO(apassos) track referenced tensors
   }
-  if (ctx->context.Async()) {
+  if (ctx->Async()) {
     // Note that for async mode, execution order will make sure that all
     // input handles are ready before executing them.
     // TODO(agarwal): Consider executing "cheap" kernels inline for performance.
     tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> handle_retvals(
         *num_retvals);
-    tensorflow::uint64 id = op->ctx->context.NextId();
+    tensorflow::uint64 id = ctx->NextId();
     for (int i = 0; i < *num_retvals; ++i) {
       tensorflow::TensorHandle* h =
-          new tensorflow::TensorHandle(id, output_dtypes[i], &op->ctx->context);
+          new tensorflow::TensorHandle(id, output_dtypes[i], ctx);
       retvals[i] = new TFE_TensorHandle(h);
       handle_retvals[i] = h;
     }
     tensorflow::EagerNode* node = new tensorflow::ExecuteNode(
-        id, &op->ctx->context, op->device, op->inputs, kernel,
-        maybe_stats.release(), output_dtypes, handle_retvals);
-    ctx->context.ExecutorAdd(node);
+        id, ctx, op->Device(), op->Inputs(), kernel, maybe_stats.release(),
+        output_dtypes, handle_retvals);
+    ctx->ExecutorAdd(node);
   } else {
     // Execute checks if retvals[i] is nullptr or not to figure if it needs to
     // allocate it.
     tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> handle_retvals(
         *num_retvals);
     status->status = tensorflow::EagerExecute(
-        &op->ctx->context, op->device, op->inputs, kernel, maybe_stats.get(),
+        ctx, op->Device(), op->Inputs(), kernel, maybe_stats.get(),
         handle_retvals.data(), *num_retvals);
     for (int i = 0; i < *num_retvals; ++i) {
       retvals[i] = new TFE_TensorHandle(handle_retvals[i]);
@@ -1142,9 +1146,3 @@ void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op,
   }
 }
 }  // namespace tensorflow
-
-TFE_Op::~TFE_Op() {
-  for (tensorflow::TensorHandle* h : inputs) {
-    h->Unref();
-  }
-}
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index 05dc64f5217..49e1aab1cef 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
 #include "tensorflow/core/common_runtime/eager/eager_executor.h"
+#include "tensorflow/core/common_runtime/eager/eager_operation.h"
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/eager/tensor_handle.h"
 #include "tensorflow/core/common_runtime/function.h"
@@ -45,7 +46,6 @@ limitations under the License.
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/public/version.h"

-
 struct TFE_ContextOptions {
   TF_SessionOptions session_options;
   // true if async execution is enabled.
@@ -85,19 +85,9 @@ struct TFE_Op {
   // t is NULL iff the TFE_Op corresponds to a TensorFlow function instead of a
   // primitive operation.
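  // A rough sketch of the resulting call pattern (hypothetical usage, not
  // part of this patch; TFE_NewOp is the existing C API constructor):
  //   TFE_Op* op = TFE_NewOp(ctx, "MatMul", status);
  //   op->operation.AddInput(handle);   // Refs the handle, bumps NumInputs.
  //   op->operation.SetDevice(device);  // Replaces the old op->device field.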
  TFE_Op(TFE_Context* ctx, const char* op, const tensorflow::AttrTypeMap* t)
-      : ctx(ctx), name(op), attrs(op), attr_types(t), device(nullptr) {}
+      : operation(&ctx->context, op, t) {}

-  ~TFE_Op();
-
-  bool const is_function() const { return attr_types == nullptr; }
-
-  TFE_Context* ctx;  // Must outlive the TFE_Op.
-  const tensorflow::string name;
-  tensorflow::AttrBuilder attrs;
-  const tensorflow::AttrTypeMap* attr_types;
-  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 4> inputs;
-  tensorflow::Device* device;
-  bool use_xla = false;
+  tensorflow::EagerOperation operation;
 };

 namespace tensorflow {
diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD
index 941a0e61c75..00ac4a4e478 100644
--- a/tensorflow/core/common_runtime/eager/BUILD
+++ b/tensorflow/core/common_runtime/eager/BUILD
@@ -54,6 +54,22 @@ tf_cuda_library(
     ],
 )

+tf_cuda_library(
+    name = "eager_operation",
+    srcs = [
+        "eager_operation.cc",
+    ],
+    hdrs = [
+        "eager_operation.h",
+    ],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":context",
+        ":tensor_handle",
+        "//tensorflow/c/eager:runtime",
+    ],
+)
+
 tf_cuda_library(
     name = "tensor_handle",
     srcs = [
diff --git a/tensorflow/core/common_runtime/eager/eager_operation.cc b/tensorflow/core/common_runtime/eager/eager_operation.cc
new file mode 100644
index 00000000000..381b05ada85
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/eager_operation.cc
@@ -0,0 +1,33 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/eager/eager_operation.h"
+
+namespace tensorflow {
+tensorflow::Status EagerOperation::SetDevice(const char* device) {
+  auto status = Status::OK();
+  tensorflow::Device* d = nullptr;
+  if (device != nullptr && strlen(device) > 0) {
+    status.Update(ctx_->FindDeviceByName(device, &d));
+  }
+  device_ = d;
+  return status;
+}
+
+void EagerOperation::AddInput(tensorflow::TensorHandle* h) {
+  h->Ref();
+  inputs_.push_back(h);
+  attrs_.NumInputs(static_cast<int>(inputs_.size()));
+}
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/eager_operation.h b/tensorflow/core/common_runtime/eager/eager_operation.h
new file mode 100644
index 00000000000..6b6e53da87a
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/eager_operation.h
@@ -0,0 +1,74 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OPERATION_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OPERATION_H_
+
+#include "tensorflow/c/eager/runtime.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+
+namespace tensorflow {
+class EagerOperation {
+ public:
+  // t is NULL iff the EagerOperation corresponds to a TensorFlow function
+  // instead of a primitive operation.
+  EagerOperation(tensorflow::EagerContext* ctx, const char* op,
+                 const tensorflow::AttrTypeMap* t)
+      : ctx_(ctx), name_(op), attrs_(op), attr_types_(t), device_(nullptr) {}
+
+  ~EagerOperation() {
+    for (tensorflow::TensorHandle* h : inputs_) {
+      h->Unref();
+    }
+  }
+
+  bool is_function() const { return attr_types_ == nullptr; }
+
+  tensorflow::EagerContext* EagerContext() { return ctx_; }
+
+  tensorflow::AttrBuilder* MutableAttrs() { return &attrs_; }
+  const tensorflow::AttrBuilder& Attrs() const { return attrs_; }
+
+  const tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 4>& Inputs()
+      const {
+    return inputs_;
+  }
+  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 4>*
+  MutableInputs() {
+    return &inputs_;
+  }
+  void AddInput(tensorflow::TensorHandle* h);
+
+  const tensorflow::string& Name() const { return name_; }
+  const tensorflow::AttrTypeMap* AttrTypes() const { return attr_types_; }
+
+  tensorflow::Device* Device() const { return device_; }
+  tensorflow::Status SetDevice(const char* device);
+  void SetDevice(tensorflow::Device* device) { device_ = device; }
+
+  void SetUseXla(bool use_xla) { use_xla_ = use_xla; }
+
+ private:
+  tensorflow::EagerContext* ctx_;  // Must outlive the EagerOperation.
+  const tensorflow::string name_;
+  tensorflow::AttrBuilder attrs_;
+  const tensorflow::AttrTypeMap* attr_types_;
+  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 4> inputs_;
+  tensorflow::Device* device_;
+  bool use_xla_ = false;
+};
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OPERATION_H_

From 2b0b015ebb1c33a409836bd1c9c98124dfd841ec Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 20 Apr 2018 11:43:48 -0700
Subject: [PATCH 0530/1734] [XLA] Fix a bug in ToProto: don't add gather attributes twice.
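HloInstruction::ToProto contained two copies of the gather-attribute block;
the hunk below deletes the redundant one. A sketch of the duplicated code
(the surviving earlier copy is outside this hunk and is assumed here):

  if (gather_dimension_numbers_ != nullptr) {
    *proto.mutable_gather_dimension_numbers() = *gather_dimension_numbers_;
  }
  for (int64 bound : gather_window_bounds_) {
    // gather_window_bounds is a repeated proto field, so a second pass
    // appends every bound again, doubling the serialized list.
    proto.add_gather_window_bounds(bound);
  }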
PiperOrigin-RevId: 193699745 --- tensorflow/compiler/xla/service/hlo_instruction.cc | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index a638d54d852..a714d0e1142 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -2451,12 +2451,6 @@ HloInstructionProto HloInstruction::ToProto() const { proto.add_fft_length(fft_len); } - if (gather_dimension_numbers_ != nullptr) { - *proto.mutable_gather_dimension_numbers() = *gather_dimension_numbers_; - } - for (int64 bound : gather_window_bounds_) { - proto.add_gather_window_bounds(bound); - } proto.set_channel_name(channel_name_); proto.set_cost_estimate_ns(cost_estimate_ns_); From 0074dffd076e0faf4da5913aebfa594ef925d6c7 Mon Sep 17 00:00:00 2001 From: Anna R Date: Fri, 20 Apr 2018 12:01:21 -0700 Subject: [PATCH 0531/1734] Prefix compat import with underscore in meta_graph_transform.py so that it doesn't get exported as part of API: https://www.tensorflow.org/versions/r1.8/api_docs/python/tf/contrib/meta_graph_transform/meta_graph_transform PiperOrigin-RevId: 193702570 --- .../meta_graph_transform/meta_graph_transform.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py index ff88b4fa841..4090c1ff3e5 100644 --- a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py +++ b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py @@ -30,7 +30,7 @@ from tensorflow.python.framework import importer as _importer from tensorflow.python.framework import ops as _ops from tensorflow.python.saved_model import constants as _saved_model_constants from tensorflow.python.training import saver as _saver_lib -from tensorflow.python.util import compat +from tensorflow.python.util import compat as _compat from tensorflow.tools import graph_transforms as _graph_transforms @@ -161,7 +161,7 @@ def _clean_save_and_restore(graph_def, op, removed_op_names): shapes = [] dtypes = [] for index, value in enumerate(name_op_value_tensor.string_val): - if not _is_removed(compat.as_str(value), removed_op_names): + if not _is_removed(_compat.as_str(value), removed_op_names): names.append(value) shapes.append(shape_op_value_tensor.string_val[index]) dtypes.append(op.attr['dtypes'].list.type[index]) @@ -651,7 +651,7 @@ def _is_removed_mentioned(s, removed_op_names): # /foo/bar. This regex ensures that we handle these two nodes # as separate entities. It matches on nodes having names in the form of # '/foo/bar_x' as well as nodes having names in the form of 'foo.' 
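  # An illustrative example (values invented for this note): for
  # s = 'foo/bar_1:0', the findall() below yields 'foo/bar_1' among its
  # matches, so a removed op named 'foo/bar' is not spuriously reported via
  # the endswith() check, while a genuine mention such as 'foo/bar:0' still
  # is.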
- s_names = _re.findall(r'((?:[\/]?[a-zA-Z0-9\_]*)*)', compat.as_str_any(s)) + s_names = _re.findall(r'((?:[\/]?[a-zA-Z0-9\_]*)*)', _compat.as_str_any(s)) for removed_op_name in removed_op_names: for s_name in s_names: if s_name.endswith(removed_op_name): @@ -737,9 +737,9 @@ def meta_graph_transform( for tag in tags: meta_graph_def.meta_info_def.tags.append(tag) - base_op_names = [compat.as_str(node.name) + base_op_names = [_compat.as_str(node.name) for node in base_meta_graph_def.graph_def.node] - retained_op_names = [compat.as_str(node.name) + retained_op_names = [_compat.as_str(node.name) for node in meta_graph_def.graph_def.node] removed_op_names = set(base_op_names) - set(retained_op_names) From 1b5839e6acad5d360ea9e5b94226b30047924cb9 Mon Sep 17 00:00:00 2001 From: Dimitris Vardoulakis Date: Fri, 20 Apr 2018 12:02:56 -0700 Subject: [PATCH 0532/1734] [TF:XLA] Now that the compiler no longer introduces implicit broadcasts, forbid them in the HLO verifier. PiperOrigin-RevId: 193702874 --- tensorflow/compiler/xla/service/BUILD | 1 + .../compiler/xla/service/hlo_verifier.cc | 21 ++++++++ .../compiler/xla/service/hlo_verifier.h | 4 ++ .../xla/service/reshape_mover_test.cc | 51 ------------------- 4 files changed, 26 insertions(+), 51 deletions(-) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 9009cbf845e..9555d918178 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -2032,6 +2032,7 @@ cc_library( srcs = ["hlo_verifier.cc"], hdrs = ["hlo_verifier.h"], deps = [ + ":hlo", ":hlo_pass", ":shape_inference", "//tensorflow/compiler/xla:status_macros", diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 80ed6d68324..8a30cbf9cd6 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -15,6 +15,7 @@ limitations under the License. #include +#include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/lib/core/errors.h" @@ -780,6 +781,24 @@ Status HloVerifier::CheckWhileInstruction(HloInstruction* instruction) { return tensorflow::Status::OK(); } +Status HloVerifier::CheckElementwiseInstruction(HloInstruction* instruction) { + const Shape& out_shape = instruction->shape(); + for (HloInstruction* operand : instruction->operands()) { + const Shape& operand_shape = operand->shape(); + if (!ShapeUtil::IsScalar(operand_shape) && + !ShapeUtil::CompatibleIgnoringElementType(operand_shape, out_shape)) { + return FailedPrecondition( + "Implicit broadcast is not allowed in HLO." 
+ "Found non-compatible shapes for instruction %s.\n" + "output: %s\noperand: %s\n", + HloOpcodeString(instruction->opcode()).c_str(), + ShapeUtil::HumanString(out_shape).c_str(), + ShapeUtil::HumanString(operand_shape).c_str()); + } + } + return tensorflow::Status::OK(); +} + StatusOr HloVerifier::Run(HloModule* module) { TF_RETURN_IF_ERROR(VerifyHloStructure(module)); @@ -821,6 +840,8 @@ StatusOr HloVerifier::Run(HloModule* module) { << " != " << ShapeUtil::Rank(instruction->operand(0)->shape()); } else if (instruction->opcode() == HloOpcode::kWhile) { TF_RETURN_IF_ERROR(CheckWhileInstruction(instruction)); + } else if (instruction->IsElementwise()) { + TF_RETURN_IF_ERROR(CheckElementwiseInstruction(instruction)); } auto previous = instructions.find(instruction->name()); diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h index 1ec55a9bdc9..6208887547a 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.h +++ b/tensorflow/compiler/xla/service/hlo_verifier.h @@ -146,6 +146,10 @@ class HloVerifier : public HloPassInterface { Status CheckWhileInstruction(HloInstruction* instruction); + // Checks that the non-scalar operand shapes are compatible to the output + // shape, i.e., that there are no implicit broadcasts of size-one dimensions. + Status CheckElementwiseInstruction(HloInstruction* instruction); + // Creates a ShapeVerifier that checks that shapes match inferred // expectations. This is a factory function because ShapeVerifier, // being a DfsHloVisitor, is stateful. We want a clean object diff --git a/tensorflow/compiler/xla/service/reshape_mover_test.cc b/tensorflow/compiler/xla/service/reshape_mover_test.cc index 094f7319f46..13e2d3258e3 100644 --- a/tensorflow/compiler/xla/service/reshape_mover_test.cc +++ b/tensorflow/compiler/xla/service/reshape_mover_test.cc @@ -458,57 +458,6 @@ TEST_F(ReshapeMoverTest, ScalarReshapeNotMovedAcrossSelect) { EXPECT_EQ(select, computation->root_instruction()); } -// Tree looks like: -// -// param0 [1,128,1] -// | -// reshape [128,1] constant [128,1024] -// \ / -// multiply w/implicit broadcast [128,1024] -// -// The reshape mover would like to sink the reshape below the multiply. -// -// Previously we would attempt to insert a reshape of the constant to [1,128,1] -// (which is unsound, because it has a different number of elements) as -// preparation for sinking the reshape. -// -// To eliminate the unsoundness, we outlaw reshape sinking when one of the -// operands is implicitly broadcast in the elementwise consumer. -// -// TODO(b/37799338) However, it would be possible in this case to do a more -// in-depth analysis to get reshape movement to occur: -// -// 1. Note that the broadcast dimension (logical dimension 1) in the operands -// would map back to logical dimension 2 in the param0 node. -// 2. Match rank of the constant to the param0 node (by prepending a trivial 1 -// dimension). -// 3. Reshape to [128,1024] at the root. -// -// But this is not currently done. 
-TEST_F(ReshapeMoverTest, ImplicitlyBroadcastReshapeIsNotMovedBug37787999) { - HloComputation::Builder builder(TestName()); - auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( - 0, ShapeUtil::MakeShape(F32, {1, 128, 1}), "param0")); - auto reshape = builder.AddInstruction(HloInstruction::CreateReshape( - ShapeUtil::MakeShape(F32, {128, 1}), param0)); - Array2D a(128, 1024); - auto literal = Literal::CreateR2FromArray2D(a); - auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(std::move(literal))); - auto multiply = builder.AddInstruction(HloInstruction::CreateBinary( - constant->shape(), HloOpcode::kMultiply, constant, reshape)); - - auto computation = module().AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), - op::Multiply(op::Constant(), op::Reshape(param0))); - - EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie()); - - EXPECT_THAT(computation->root_instruction(), - op::Multiply(op::Constant(), op::Reshape(param0))); - EXPECT_EQ(multiply, computation->root_instruction()); -} - // Tree looks like this: // // add1 From ceed923d600584ade8d159271422b4a08f728cbb Mon Sep 17 00:00:00 2001 From: Yangzihao Wang Date: Fri, 20 Apr 2018 12:05:11 -0700 Subject: [PATCH 0533/1734] Add native dilated support for conv3d and its gradients in cudnn v>=6. PiperOrigin-RevId: 193703316 --- tensorflow/core/framework/common_shape_fns.cc | 32 ++- .../core/framework/common_shape_fns_test.cc | 55 ++++- tensorflow/core/kernels/conv_grad_ops_3d.cc | 115 +++++++++- tensorflow/core/kernels/conv_ops_3d.cc | 52 ++++- tensorflow/core/ops/nn_ops.cc | 2 + .../python/kernel_tests/conv_ops_3d_test.py | 196 +++++++++++++++++- tensorflow/python/ops/nn_grad.py | 6 + 7 files changed, 426 insertions(+), 32 deletions(-) diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc index 72eeda7a43e..0916c9b7a85 100644 --- a/tensorflow/core/framework/common_shape_fns.cc +++ b/tensorflow/core/framework/common_shape_fns.cc @@ -487,6 +487,15 @@ Status Conv3DShape(shape_inference::InferenceContext* c) { string data_format; Status s = c->GetAttr("data_format", &data_format); + std::vector dilations; + TF_RETURN_IF_ERROR(c->GetAttr("dilations", &dilations)); + + if (dilations.size() != 5) { + return errors::InvalidArgument( + "Conv3D requires the dilation attribute to contain 5 values, but got: ", + dilations.size()); + } + std::vector strides; TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides)); if (strides.size() != 5) { @@ -496,6 +505,7 @@ Status Conv3DShape(shape_inference::InferenceContext* c) { } int32 stride_planes, stride_rows, stride_cols; + int32 dilation_planes, dilation_rows, dilation_cols; if (s.ok() && data_format == "NCDHW") { // Convert input_shape to NDHWC. 
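    // In NCDHW the spatial entries of the stride/dilation attribute lists
    // sit at indices 2..4; e.g. (illustrative) strides = {1, 1, p, r, c}
    // becomes stride_planes = p, stride_rows = r, stride_cols = c below,
    // while the NDHWC branch reads indices 1..3 instead.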
auto dim = [&](char dimension) { @@ -506,10 +516,16 @@ Status Conv3DShape(shape_inference::InferenceContext* c) { stride_planes = strides[2]; stride_rows = strides[3]; stride_cols = strides[4]; + dilation_planes = dilations[2]; + dilation_cols = dilations[3]; + dilation_rows = dilations[4]; } else { stride_planes = strides[1]; stride_rows = strides[2]; stride_cols = strides[3]; + dilation_planes = dilations[1]; + dilation_cols = dilations[2]; + dilation_rows = dilations[3]; } DimensionHandle batch_size_dim = c->Dim(input_shape, 0); @@ -530,13 +546,15 @@ Status Conv3DShape(shape_inference::InferenceContext* c) { TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding)); DimensionHandle output_planes, output_rows, output_cols; - TF_RETURN_IF_ERROR( - GetWindowedOutputSizeFromDims(c, in_planes_dim, filter_planes_dim, - stride_planes, padding, &output_planes)); - TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDims( - c, in_rows_dim, filter_rows_dim, stride_rows, padding, &output_rows)); - TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDims( - c, in_cols_dim, filter_cols_dim, stride_cols, padding, &output_cols)); + TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2( + c, in_planes_dim, filter_planes_dim, dilation_planes, stride_planes, + padding, &output_planes)); + TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2( + c, in_rows_dim, filter_rows_dim, dilation_rows, stride_rows, padding, + &output_rows)); + TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2( + c, in_cols_dim, filter_cols_dim, dilation_cols, stride_cols, padding, + &output_cols)); ShapeHandle output_shape; if (data_format == "NCDHW") { diff --git a/tensorflow/core/framework/common_shape_fns_test.cc b/tensorflow/core/framework/common_shape_fns_test.cc index 13d429b8951..919e0967c03 100644 --- a/tensorflow/core/framework/common_shape_fns_test.cc +++ b/tensorflow/core/framework/common_shape_fns_test.cc @@ -644,15 +644,19 @@ TEST(CommonShapeFnsTest, Conv3DShapeTest) { .Finalize(&op.node_def)); }; - // 1x1x1 filter - set_op({{1, 1, 1, 1, 1}}, "VALID"); - INFER_OK(op, "[1,2,2,2,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]"); - // Invalid rank for input INFER_ERROR("must be rank 5", op, "[4,4];[2,1,1,1]"); // Invalid rank for filter INFER_ERROR("must be rank 5", op, "[1,4,4,1];[2,1,1]"); + // Invalid value for strides + set_op({{1, 1, 1, 0, 1}}, "VALID"); + INFER_ERROR("must be > 0", op, "[1,2,2,2,1];[1,1,1,1,1]"); + + // 1x1x1 filter + set_op({{1, 1, 1, 1, 1}}, "VALID"); + INFER_OK(op, "[1,2,2,2,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]"); + // unknown dims in the critical fields give partial inference. 
INFER_OK(op, "[1,2,2,2,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]"); INFER_OK(op, "[1,?,2,2,1];[1,1,1,1,1]", "[d0_0,?,2,2,d1_4]"); @@ -712,6 +716,49 @@ TEST(CommonShapeFnsTest, Conv3DShapeTest) { INFER_OK(op, "[1,4,9,4,1];[2,2,2,1,?]", "[d0_0,2,3,1,d1_4]"); } +TEST(CommonShapeFnsTest, Conv3DDilatedShapeTest) { + ShapeInferenceTestOp op("Conv3D"); + auto set_op = [&op](const std::vector& dilations, + const std::vector& strides, + const string& padding) { + TF_CHECK_OK(NodeDefBuilder("test", "Conv3D") + .Input("input", 0, DT_FLOAT) + .Input("filter", 0, DT_FLOAT) + .Attr("dilations", dilations) + .Attr("strides", strides) + .Attr("padding", padding) + .Finalize(&op.node_def)); + }; + + // Invalid rank for dilation + set_op({{1, 2, 1, 1}}, {{1, 1, 1, 1, 1}}, "VALID"); + INFER_ERROR("contain 5 values", op, "[1,2,2,2,1];[1,1,1,1,1]"); + + // Invalid value for dilation + set_op({{1, 2, 0, 1, 1}}, {{1, 1, 1, 1, 1}}, "VALID"); + INFER_ERROR("must be >= 1", op, "[1,2,2,2,1];[1,1,1,1,1]"); + + // 2x1x1 dilation 1x1x1 filter + set_op({{1, 2, 1, 1, 1}}, {{1, 1, 1, 1, 1}}, "VALID"); + INFER_OK(op, "[1,2,2,2,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]"); + + // 2x1x1 dilation 2x2x2 filter + set_op({{1, 2, 1, 1, 1}}, {{1, 1, 1, 1, 1}}, "VALID"); + INFER_OK(op, "[1,3,2,2,1];[2,2,2,1,1]", "[d0_0,1,1,1,d1_4]"); + + // 2x1x1 dilation 3x3x3 input, 1x1x1 filter, 2x2x2 stride + set_op({{1, 2, 1, 1, 1}}, {{1, 2, 2, 2, 1}}, "VALID"); + INFER_OK(op, "[1,3,3,3,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]"); + + // 2x1x1 dilation 3x3x3 input, 1x1x1 filter, 2x1x1 stride + set_op({{1, 2, 1, 1, 1}}, {{1, 2, 1, 1, 1}}, "VALID"); + INFER_OK(op, "[1,3,3,3,1];[1,1,1,1,1]", "[d0_0,2,3,3,d1_4]"); + + // 2x1x1 dilation 4x4x4 input, 2x2x2 filter, 1x1x1 stride + set_op({{1, 2, 1, 1, 1}}, {{1, 1, 1, 1, 1}}, "SAME"); + INFER_OK(op, "[1,4,4,4,1];[2,2,2,1,1]", "[d0_0,d0_1,d0_2,d0_3,d1_4]"); +} + TEST(CommonShapeFnsTest, DepthwiseConv2DShapeTest) { ShapeInferenceTestOp op("DepthwiseConv2dNative"); std::vector strides = {{1, 1, 1, 1}}; diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc index 1234997bc57..092e859a5be 100644 --- a/tensorflow/core/kernels/conv_grad_ops_3d.cc +++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc @@ -79,13 +79,18 @@ typedef Eigen::GpuDevice GPUDevice; context, out_depth == GetTensorDim(out_backprop, data_format_, 'C'), \ errors::InvalidArgument( \ label, ": filter and out_backprop must have the same out_depth")); \ + const std::array dilations = { \ + {GetTensorDim(dilation_, data_format_, '0'), \ + GetTensorDim(dilation_, data_format_, '1'), \ + GetTensorDim(dilation_, data_format_, '2')}}; \ const std::array strides = { \ {GetTensorDim(stride_, data_format_, '0'), \ GetTensorDim(stride_, data_format_, '1'), \ GetTensorDim(stride_, data_format_, '2')}}; \ std::array out, padding; \ - OP_REQUIRES_OK(context, Get3dOutputSize(input_size, filter_size, strides, \ - padding_, &out, &padding)); \ + OP_REQUIRES_OK( \ + context, Get3dOutputSizeV2(input_size, filter_size, dilations, strides, \ + padding_, &out, &padding)); \ OP_REQUIRES(context, output_planes == out[0], \ errors::InvalidArgument( \ label, \ @@ -151,6 +156,26 @@ class Conv3DBackpropInputOp : public OpKernel { "Conv3DBackpropInputOpV2 only supports NDHWC on the CPU.")); } + OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_)); + OP_REQUIRES(context, dilation_.size() == 5, + errors::InvalidArgument("Dilation rates field must " + "specify 5 dimensions")); + OP_REQUIRES(context, + (GetTensorDim(dilation_, 
data_format_, 'C') == 1 && + GetTensorDim(dilation_, data_format_, 'N') == 1), + errors::InvalidArgument( + "Current implementation does not yet support " + "dilation rates in the batch and depth dimensions.")); + + // TODO(yangzihao): Add CPU version of dilated conv 3D. + OP_REQUIRES(context, + (GetTensorDim(dilation_, data_format_, '0') == 1 && + GetTensorDim(dilation_, data_format_, '1') == 1 && + GetTensorDim(dilation_, data_format_, '2') == 1), + errors::InvalidArgument( + "Current CPU implementation does not yet support " + "dilation rates larger than 1.")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); OP_REQUIRES(context, stride_.size() == 5, errors::InvalidArgument("Sliding window strides field must " @@ -223,6 +248,7 @@ class Conv3DBackpropInputOp : public OpKernel { } private: + std::vector dilation_; std::vector stride_; Padding padding_; TensorFormat data_format_; @@ -261,6 +287,26 @@ class Conv3DBackpropFilterOp : public OpKernel { "Conv3DBackpropFilterOpV2 only supports NDHWC on the CPU.")); } + OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_)); + OP_REQUIRES(context, dilation_.size() == 5, + errors::InvalidArgument("Dilation rates field must " + "specify 5 dimensions")); + OP_REQUIRES(context, + (GetTensorDim(dilation_, data_format_, 'C') == 1 && + GetTensorDim(dilation_, data_format_, 'N') == 1), + errors::InvalidArgument( + "Current implementation does not yet support " + "dilation rates in the batch and depth dimensions.")); + + // TODO(yangzihao): Add CPU version of dilated conv 3D. + OP_REQUIRES(context, + (GetTensorDim(dilation_, data_format_, '0') == 1 && + GetTensorDim(dilation_, data_format_, '1') == 1 && + GetTensorDim(dilation_, data_format_, '2') == 1), + errors::InvalidArgument( + "Current CPU implementation does not yet support " + "dilation rates larger than 1.")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); OP_REQUIRES(context, stride_.size() == 5, errors::InvalidArgument("Sliding window strides field must " @@ -370,6 +416,7 @@ class Conv3DBackpropFilterOp : public OpKernel { } private: + std::vector dilation_; std::vector stride_; Padding padding_; TensorFormat data_format_; @@ -438,6 +485,22 @@ class Conv3DBackpropInputOp : public OpKernel { OP_REQUIRES(context, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); } + OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_)); + OP_REQUIRES(context, dilation_.size() == 5, + errors::InvalidArgument("Dilation rates field must " + "specify 5 dimensions")); + OP_REQUIRES(context, + (GetTensorDim(dilation_, data_format_, 'C') == 1 && + GetTensorDim(dilation_, data_format_, 'N') == 1), + errors::InvalidArgument( + "Current implementation does not yet support " + "dilation rates in the batch and depth dimensions.")); + OP_REQUIRES( + context, + (GetTensorDim(dilation_, data_format_, '0') > 0 && + GetTensorDim(dilation_, data_format_, '1') > 0 && + GetTensorDim(dilation_, data_format_, '2') > 0), + errors::InvalidArgument("Dilated rates should be larger than 0.")); OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); OP_REQUIRES(context, stride_.size() == 5, errors::InvalidArgument("Sliding window strides field must " @@ -448,6 +511,12 @@ class Conv3DBackpropInputOp : public OpKernel { GetTensorDim(stride_, data_format_, 'N') == 1), errors::InvalidArgument("Current implementation does not yet support " "strides in the batch and depth dimensions.")); + OP_REQUIRES( + context, + 
(GetTensorDim(stride_, data_format_, '0') > 0 && + GetTensorDim(stride_, data_format_, '1') > 0 && + GetTensorDim(stride_, data_format_, '2') > 0), + errors::InvalidArgument("Spatial strides should be larger than 0.")); OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); cudnn_use_autotune_ = CudnnUseAutotune(); } @@ -471,6 +540,7 @@ class Conv3DBackpropInputOp : public OpKernel { OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); if (filter_size[0] == 1 && filter_size[1] == 1 && filter_size[2] == 1 && + dilation_[0] == 1 && dilation_[1] == 1 && dilation_[2] == 1 && stride_[0] == 1 && stride_[1] == 1 && stride_[2] == 1 && data_format_ == FORMAT_NHWC) { const uint64 m = batch * input_size[0] * input_size[1] * input_size[2]; @@ -580,7 +650,10 @@ class Conv3DBackpropInputOp : public OpKernel { .set_input_feature_map_count(in_depth) .set_output_feature_map_count(out_depth); perftools::gputools::dnn::ConvolutionDescriptor conv_desc(3); - conv_desc.set_filter_stride(DimIndex::X, strides[2]) + conv_desc.set_dilation_rate(DimIndex::X, dilations[2]) + .set_dilation_rate(DimIndex::Y, dilations[1]) + .set_dilation_rate(DimIndex::Z, dilations[0]) + .set_filter_stride(DimIndex::X, strides[2]) .set_filter_stride(DimIndex::Y, strides[1]) .set_filter_stride(DimIndex::Z, strides[0]) .set_zero_padding(DimIndex::X, padding_cols / 2) @@ -645,9 +718,7 @@ class Conv3DBackpropInputOp : public OpKernel { {{input_size[0], input_size[1], input_size[2]}}, out_depth, {{filter_size[0], filter_size[1], filter_size[2]}}, - // TODO(yangzihao): Send in arbitrary dilation rates after the dilated - // conv is supported. - /*dilation=*/{{1, 1, 1}}, + {{dilations[0], dilations[1], dilations[2]}}, {{strides[0], strides[1], strides[2]}}, {{padding_planes, padding_rows, padding_cols}}, dtype, @@ -755,6 +826,7 @@ class Conv3DBackpropInputOp : public OpKernel { } private: + std::vector dilation_; std::vector stride_; Padding padding_; TensorFormat data_format_; @@ -784,6 +856,22 @@ class Conv3DBackpropFilterOp : public OpKernel { OP_REQUIRES(context, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); } + OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_)); + OP_REQUIRES(context, dilation_.size() == 5, + errors::InvalidArgument("Dilation rates field must " + "specify 5 dimensions")); + OP_REQUIRES(context, + (GetTensorDim(dilation_, data_format_, 'C') == 1 && + GetTensorDim(dilation_, data_format_, 'N') == 1), + errors::InvalidArgument( + "Current implementation does not yet support " + "dilation rates in the batch and depth dimensions.")); + OP_REQUIRES( + context, + (GetTensorDim(dilation_, data_format_, '0') > 0 && + GetTensorDim(dilation_, data_format_, '1') > 0 && + GetTensorDim(dilation_, data_format_, '2') > 0), + errors::InvalidArgument("Dilated rates should be larger than 0.")); OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); OP_REQUIRES(context, stride_.size() == 5, errors::InvalidArgument("Sliding window strides field must " @@ -794,6 +882,12 @@ class Conv3DBackpropFilterOp : public OpKernel { GetTensorDim(stride_, data_format_, 'N') == 1), errors::InvalidArgument("Current implementation does not yet support " "strides in the batch and depth dimensions.")); + OP_REQUIRES( + context, + (GetTensorDim(stride_, data_format_, '0') > 0 && + GetTensorDim(stride_, data_format_, '1') > 0 && + GetTensorDim(stride_, data_format_, '2') > 0), + errors::InvalidArgument("Spatial strides should be larger than 0.")); 
OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); cudnn_use_autotune_ = CudnnUseAutotune(); } @@ -820,6 +914,7 @@ class Conv3DBackpropFilterOp : public OpKernel { OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); if (filter_size[1] == 1 && filter_size[2] == 1 && filter_size[0] == 1 && + dilations[2] == 1 && dilations[1] == 1 && dilations[0] == 1 && strides[2] == 1 && strides[1] == 1 && strides[0] == 1 && data_format_ == FORMAT_NHWC) { const uint64 m = in_depth; @@ -943,7 +1038,10 @@ class Conv3DBackpropFilterOp : public OpKernel { .set_input_feature_map_count(in_depth) .set_output_feature_map_count(out_depth); perftools::gputools::dnn::ConvolutionDescriptor conv_desc(3); - conv_desc.set_filter_stride(DimIndex::X, strides[2]) + conv_desc.set_dilation_rate(DimIndex::X, dilations[2]) + .set_dilation_rate(DimIndex::Y, dilations[1]) + .set_dilation_rate(DimIndex::Z, dilations[0]) + .set_filter_stride(DimIndex::X, strides[2]) .set_filter_stride(DimIndex::Y, strides[1]) .set_filter_stride(DimIndex::Z, strides[0]) .set_zero_padding(DimIndex::X, padding_cols / 2) @@ -1016,7 +1114,7 @@ class Conv3DBackpropFilterOp : public OpKernel { {{input_size[0], input_size[1], input_size[2]}}, out_depth, {{filter_size[0], filter_size[1], filter_size[2]}}, - {{1, 1, 1}}, + {{dilations[0], dilations[1], dilations[2]}}, {{strides[0], strides[1], strides[2]}}, {{padding_planes, padding_rows, padding_cols}}, dtype, @@ -1102,6 +1200,7 @@ class Conv3DBackpropFilterOp : public OpKernel { } private: + std::vector dilation_; std::vector stride_; Padding padding_; TensorFormat data_format_; diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc index 0b7c1524e65..48dd3c9eb03 100644 --- a/tensorflow/core/kernels/conv_ops_3d.cc +++ b/tensorflow/core/kernels/conv_ops_3d.cc @@ -49,12 +49,18 @@ template struct LaunchConvOp { static void launch(OpKernelContext* context, bool cudnn_use_autotune, const Tensor& input, const Tensor& filter, + const std::array& dilations, const std::array& strides, const Padding padding, TensorFormat data_format, Tensor* output) { OP_REQUIRES(context, data_format == FORMAT_NHWC, errors::InvalidArgument("CPU implementation of Conv3D " "currently only supports the NHWC " "tensor format.")); + OP_REQUIRES(context, + dilations[0] == 1 && dilations[1] == 1 && dilations[2] == 1, + errors::InvalidArgument("CPU implementation of Conv3D " + "currently only supports dilated rates " + "of 1.")); functor::CuboidConvolution()( context->eigen_device(), output->tensor(), input.tensor(), filter.tensor(), strides[2], strides[1], @@ -80,6 +86,28 @@ class Conv3DOp : public BinaryOp { GetTensorDim(stride_, data_format_, 'C') == 1), errors::InvalidArgument("Current implementation does not yet support " "strides in the batch and depth dimensions.")); + OP_REQUIRES( + context, + (GetTensorDim(stride_, data_format_, '0') > 0 && + GetTensorDim(stride_, data_format_, '1') > 0 && + GetTensorDim(stride_, data_format_, '2') > 0), + errors::InvalidArgument("Spatial strides should be larger than 0.")); + OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_)); + OP_REQUIRES(context, dilation_.size() == 5, + errors::InvalidArgument("Dilation rates field must " + "specify 5 dimensions")); + OP_REQUIRES(context, + (GetTensorDim(dilation_, data_format_, 'N') == 1 && + GetTensorDim(dilation_, data_format_, 'C') == 1), + errors::InvalidArgument( + "Current implementation does not yet support " + "dilation rates in the batch and depth 
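    // The dilation/stride validation above mirrors Conv3DBackpropInputOp
    // earlier in this file: batch and depth entries must be exactly 1, and
    // the three spatial entries strictly positive.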
dimensions.")); + OP_REQUIRES( + context, + (GetTensorDim(dilation_, data_format_, '0') > 0 && + GetTensorDim(dilation_, data_format_, '1') > 0 && + GetTensorDim(dilation_, data_format_, '2') > 0), + errors::InvalidArgument("Dilated rates should be larger than 0.")); OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); cudnn_use_autotune_ = CudnnUseAutotune(); } @@ -115,13 +143,18 @@ class Conv3DOp : public BinaryOp { GetTensorDim(input, data_format_, '2')}}; std::array filter_size = { {filter.dim_size(0), filter.dim_size(1), filter.dim_size(2)}}; + std::array dilations = { + {GetTensorDim(dilation_, data_format_, '0'), + GetTensorDim(dilation_, data_format_, '1'), + GetTensorDim(dilation_, data_format_, '2')}}; std::array strides = {{GetTensorDim(stride_, data_format_, '0'), GetTensorDim(stride_, data_format_, '1'), GetTensorDim(stride_, data_format_, '2')}}; std::array out, padding; - OP_REQUIRES_OK(context, Get3dOutputSize(input_size, filter_size, strides, - padding_, &out, &padding)); + OP_REQUIRES_OK( + context, Get3dOutputSizeV2(input_size, filter_size, dilations, strides, + padding_, &out, &padding)); TensorShape out_shape = ShapeFromFormat( data_format_, in_batch, {{out[0], out[1], out[2]}}, out_depth); Tensor* output; @@ -131,10 +164,12 @@ class Conv3DOp : public BinaryOp { if (out_shape.num_elements() == 0) return; LaunchConvOp::launch(context, cudnn_use_autotune_, input, filter, - strides, padding_, data_format_, output); + dilations, strides, padding_, data_format_, + output); } private: + std::vector dilation_; std::vector stride_; Padding padding_; TensorFormat data_format_; @@ -165,6 +200,7 @@ template struct LaunchConvOp { static void launch(OpKernelContext* ctx, bool cudnn_use_autotune, const Tensor& input_param, const Tensor& filter, + const std::array& dilations, const std::array& strides, const Padding padding, TensorFormat data_format, Tensor* output) { auto* stream = ctx->op_device_context()->stream(); @@ -199,6 +235,7 @@ struct LaunchConvOp { // NOTE: This only works in NHWC. if (filter_planes == 1 && filter_rows == 1 && filter_cols == 1 && + dilations[0] == 1 && dilations[1] == 1 && dilations[2] == 1 && strides[0] == 1 && strides[1] == 1 && strides[2] == 1 && data_format == FORMAT_NHWC) { // 1x1 filter, so call cublas directly. @@ -330,7 +367,10 @@ struct LaunchConvOp { .set_input_feature_map_count(in_depth) .set_output_feature_map_count(out_depth); perftools::gputools::dnn::ConvolutionDescriptor conv_desc(3); - conv_desc.set_filter_stride(DimIndex::X, strides[2]) + conv_desc.set_dilation_rate(DimIndex::X, dilations[2]) + .set_dilation_rate(DimIndex::Y, dilations[1]) + .set_dilation_rate(DimIndex::Z, dilations[0]) + .set_filter_stride(DimIndex::X, strides[2]) .set_filter_stride(DimIndex::Y, strides[1]) .set_filter_stride(DimIndex::Z, strides[0]) .set_zero_padding(DimIndex::X, pad_cols / 2) @@ -377,9 +417,7 @@ struct LaunchConvOp { {{in_planes, in_rows, in_cols}}, out_depth, {{filter_planes, filter_rows, filter_cols}}, - // TODO(yangzihao): Send in arbitrary dilation rates after the dilated - // conv is supported. 
- /*dilation=*/{{1, 1, 1}}, + {{dilations[0], dilations[1], dilations[2]}}, {{strides[0], strides[1], strides[2]}}, {{pad_planes, pad_rows, pad_cols}}, dtype, diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index 12d6dc5eaf2..6dc3d9df310 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -524,6 +524,7 @@ REGISTER_OP("Conv3DBackpropInput") .Attr("strides: list(int) >= 5") .Attr(GetPaddingAttrString()) .Deprecated(10, "Use Conv3DBackpropInputV2") + .Attr("dilations: list(int) = [1, 1, 1, 1, 1]") .SetShapeFn([](InferenceContext* c) { return UnchangedShapeWithRank(c, 5); }); @@ -537,6 +538,7 @@ REGISTER_OP("Conv3DBackpropFilter") .Attr("strides: list(int) >= 5") .Attr(GetPaddingAttrString()) .Deprecated(10, "Use Conv3DBackpropFilterV2") + .Attr("dilations: list(int) = [1, 1, 1, 1, 1]") .SetShapeFn([](InferenceContext* c) { ShapeHandle out; TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 5, &out)); diff --git a/tensorflow/python/kernel_tests/conv_ops_3d_test.py b/tensorflow/python/kernel_tests/conv_ops_3d_test.py index f4616fd661f..0b531125f36 100644 --- a/tensorflow/python/kernel_tests/conv_ops_3d_test.py +++ b/tensorflow/python/kernel_tests/conv_ops_3d_test.py @@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import test_util from tensorflow.python.ops import gradient_checker +from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import nn_ops import tensorflow.python.ops.nn_grad # pylint: disable=unused-import from tensorflow.python.platform import test @@ -61,18 +62,18 @@ class Conv3DTest(test.TestCase): def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, stride, padding, data_format, dtype, use_gpu): - total_size_1 = 1 - total_size_2 = 1 + total_size_tensor = 1 + total_size_filter = 1 for s in tensor_in_sizes: - total_size_1 *= s + total_size_tensor *= s for s in filter_in_sizes: - total_size_2 *= s + total_size_filter *= s # Initializes the input tensor with array containing numbers from 0 to 1. # We keep the input tensor values fairly small to avoid overflowing float16 # during the conv3d. - x1 = [f * 1.0 / total_size_1 for f in range(1, total_size_1 + 1)] - x2 = [f * 1.0 / total_size_2 for f in range(1, total_size_2 + 1)] + x1 = [f * 1.0 / total_size_tensor for f in range(1, total_size_tensor + 1)] + x2 = [f * 1.0 / total_size_filter for f in range(1, total_size_filter + 1)] with self.test_session(use_gpu=use_gpu): t1 = constant_op.constant(x1, shape=tensor_in_sizes, dtype=dtype) t2 = constant_op.constant(x2, shape=filter_in_sizes, dtype=dtype) @@ -118,6 +119,79 @@ class Conv3DTest(test.TestCase): self.assertAllClose(expected, value.flatten(), atol=tol, rtol=tol) + def _ComputeReferenceDilatedConv(self, tensor_in_sizes, filter_in_sizes, + stride, dilation, padding, data_format, + use_gpu): + total_size_tensor = 1 + total_size_filter = 1 + for s in tensor_in_sizes: + total_size_tensor *= s + for s in filter_in_sizes: + total_size_filter *= s + + # Initializes the input tensor with array containing incrementing + # numbers from 1. 
+ x1 = [f * 1.0 for f in range(1, total_size_tensor + 1)] + x2 = [f * 1.0 for f in range(1, total_size_filter + 1)] + with self.test_session(use_gpu=use_gpu): + t1 = constant_op.constant(x1, shape=tensor_in_sizes) + t2 = constant_op.constant(x2, shape=filter_in_sizes) + if isinstance(stride, collections.Iterable): + strides = list(stride) + else: + strides = [stride, stride, stride] + if data_format == "NCDHW": + t1 = test_util.NHWCToNCHW(t1) + full_strides = [1, 1] + strides + full_dilation = [1, 1] + dilation + else: + full_strides = [1] + strides + [1] + full_dilation = [1] + dilation + [1] + expected = nn_ops.convolution( + t1, + t2, + padding=padding, + strides=strides, + dilation_rate=dilation, + data_format=data_format) + computed = nn_ops.conv3d( + t1, + t2, + strides=full_strides, + dilations=full_dilation, + padding=padding, + data_format=data_format) + if data_format == "NCDHW": + expected = test_util.NCHWToNHWC(expected) + computed = test_util.NCHWToNHWC(computed) + return expected, computed + + def _VerifyDilatedConvValues(self, tensor_in_sizes, filter_in_sizes, stride, + padding, dilations): + expected_results = [] + computed_results = [] + default_dilations = ( + dilations[0] == 1 and dilations[1] == 1 and dilations[2] == 1) + for data_format, use_gpu in GetTestConfigs(): + # If any dilation rate is larger than 1, only do test on the GPU + # because we currently do not have a CPU implementation for arbitrary + # dilation rates. + if default_dilations or use_gpu: + expected, computed = self._ComputeReferenceDilatedConv( + tensor_in_sizes, filter_in_sizes, stride, dilations, padding, + data_format, use_gpu) + expected_results.append(expected) + computed_results.append(computed) + tolerance = 1e-2 if use_gpu else 1e-5 + with self.test_session() as sess: + expected_values = sess.run(expected_results) + computed_values = sess.run(computed_results) + for e_value, c_value in zip(expected_values, computed_values): + print("expected = ", e_value) + print("actual = ", c_value) + self.assertAllClose( + e_value.flatten(), c_value.flatten(), atol=tolerance, rtol=1e-6) + def testConv3D1x1x1Filter(self): expected_output = [ 0.18518519, 0.22222222, 0.25925926, 0.40740741, 0.5, 0.59259259, @@ -145,6 +219,15 @@ class Conv3DTest(test.TestCase): padding="VALID", expected=expected_output) + def testConv3D1x1x1Filter2x1x1Dilation(self): + if test.is_gpu_available(cuda_only=True): + self._VerifyDilatedConvValues( + tensor_in_sizes=[1, 3, 6, 1, 1], + filter_in_sizes=[1, 1, 1, 1, 1], + stride=1, + padding="VALID", + dilations=[2, 1, 1]) + # Expected values computed using scipy's correlate function. 
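  # A sketch of that offline computation for a single-channel volume
  # (hypothetical; scipy is not a dependency of this test):
  #   from scipy import signal
  #   expected = signal.correlate(input_vol, filter_vol, mode='valid')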
def testConv3D2x2x2Filter(self): expected_output = [ @@ -161,6 +244,15 @@ class Conv3DTest(test.TestCase): padding="VALID", expected=expected_output) + def testConv3D2x2x2Filter1x2x1Dilation(self): + if test.is_gpu_available(cuda_only=True): + self._VerifyDilatedConvValues( + tensor_in_sizes=[1, 4, 6, 3, 1], + filter_in_sizes=[2, 2, 2, 1, 1], + stride=1, + padding="VALID", + dilations=[1, 2, 1]) + def testConv3DStrides(self): expected_output = [ 0.06071429, 0.08988095, 0.10238095, 0.11488095, 0.12738095, 0.13988095, @@ -546,6 +638,98 @@ class Conv3DTest(test.TestCase): padding="SAME", test_input=False) + # Testing for backprops + def _RunAndVerifyBackprop(self, input_sizes, filter_sizes, output_sizes, + strides, dilations, padding, data_format, use_gpu, + err, mode): + total_input_size = 1 + total_filter_size = 1 + for s in input_sizes: + total_input_size *= s + for s in filter_sizes: + total_filter_size *= s + # Initializes the input tensor with array containing incrementing + # numbers from 1. + x1 = [f * 1.0 for f in range(1, total_input_size + 1)] + x2 = [f * 1.0 for f in range(1, total_filter_size + 1)] + default_dilations = ( + dilations[0] == 1 and dilations[1] == 1 and dilations[2] == 1) + + # If any dilation rate is larger than 1, only do test on the GPU + # because we currently do not have a CPU implementation for arbitrary + # dilation rates. + if default_dilations or use_gpu: + with self.test_session(use_gpu=use_gpu) as sess: + if data_format == "NCDHW": + input_sizes = test_util.NHWCToNCHW(input_sizes) + t1 = constant_op.constant(x1, shape=input_sizes) + t2 = constant_op.constant(x2, shape=filter_sizes) + full_strides = [1] + strides + [1] + full_dilations = [1] + dilations + [1] + if data_format == "NCDHW": + full_strides = test_util.NHWCToNCHW(full_strides) + full_dilations = test_util.NHWCToNCHW(full_dilations) + actual = nn_ops.conv3d( + t1, + t2, + strides=full_strides, + dilations=full_dilations, + padding=padding, + data_format=data_format) + expected = nn_ops.convolution( + t1, + t2, + padding=padding, + strides=strides, + dilation_rate=dilations, + data_format=data_format) + if data_format == "NCDHW": + actual = test_util.NCHWToNHWC(actual) + expected = test_util.NCHWToNHWC(expected) + actual_grad = gradients_impl.gradients(actual, t1 + if mode == "input" else t2)[0] + expected_grad = gradients_impl.gradients(expected, t1 + if mode == "input" else t2)[0] + # "values" consists of two tensors for two backprops + actual_value = sess.run(actual_grad) + expected_value = sess.run(expected_grad) + self.assertShapeEqual(actual_value, actual_grad) + self.assertShapeEqual(expected_value, expected_grad) + print("expected = ", expected_value) + print("actual = ", actual_value) + self.assertArrayNear(expected_value.flatten(), actual_value.flatten(), + err) + + def testConv3D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self): + if test.is_gpu_available(cuda_only=True): + for (data_format, use_gpu) in GetTestConfigs(): + self._RunAndVerifyBackprop( + input_sizes=[1, 3, 6, 1, 1], + filter_sizes=[2, 2, 1, 1, 1], + output_sizes=[1, 1, 5, 1, 1], + strides=[1, 1, 1], + dilations=[2, 1, 1], + padding="VALID", + data_format=data_format, + use_gpu=use_gpu, + err=1e-5, + mode="filter") + + def testConv3D2x2Depth3ValidBackpropInputStride1x1Dilation2x1(self): + if test.is_gpu_available(cuda_only=True): + for (data_format, use_gpu) in GetTestConfigs(): + self._RunAndVerifyBackprop( + input_sizes=[1, 3, 6, 1, 1], + filter_sizes=[2, 2, 1, 1, 1], + output_sizes=[1, 1, 5, 1, 1], + strides=[1, 
1, 1], + dilations=[2, 1, 1], + padding="VALID", + data_format=data_format, + use_gpu=use_gpu, + err=1e-5, + mode="input") + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py index 4af5bd26dd8..3a41391340e 100644 --- a/tensorflow/python/ops/nn_grad.py +++ b/tensorflow/python/ops/nn_grad.py @@ -94,6 +94,7 @@ def _Conv3DGrad(op, grad): array_ops.shape(op.inputs[0]), op.inputs[1], grad, + dilations=op.get_attr("dilations"), strides=op.get_attr("strides"), padding=op.get_attr("padding"), data_format=data_format), @@ -101,6 +102,7 @@ def _Conv3DGrad(op, grad): op.inputs[0], array_ops.shape(op.inputs[1]), grad, + dilations=op.get_attr("dilations"), strides=op.get_attr("strides"), padding=op.get_attr("padding"), data_format=data_format) @@ -116,12 +118,14 @@ def _Conv3DBackpropInputGrad(op, grad): grad, array_ops.shape(op.inputs[1]), op.inputs[2], + dilations=op.get_attr("dilations"), strides=op.get_attr("strides"), padding=op.get_attr("padding"), data_format=data_format), nn_ops.conv3d( grad, op.inputs[1], + dilations=op.get_attr("dilations"), strides=op.get_attr("strides"), padding=op.get_attr("padding"), data_format=data_format) @@ -136,12 +140,14 @@ def _Conv3DBackpropFilterGrad(op, grad): array_ops.shape(op.inputs[0]), grad, op.inputs[2], + dilations=op.get_attr("dilations"), strides=op.get_attr("strides"), padding=op.get_attr("padding"), data_format=data_format), None, nn_ops.conv3d( op.inputs[0], grad, + dilations=op.get_attr("dilations"), strides=op.get_attr("strides"), padding=op.get_attr("padding"), data_format=data_format) From a175841eb549f069ac205fb32bf55314a387fe6d Mon Sep 17 00:00:00 2001 From: jinghuangintel Date: Fri, 20 Apr 2018 12:20:00 -0700 Subject: [PATCH 0534/1734] [INTEL MKLDNN]: Upgrade mkldnn version to v13 (#18508) * upgrade mkldnn version to v13 * upgrade mkldnn version to v13 for all platforms --- tensorflow/workspace.bzl | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index c58ef87338c..f0a81f77545 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -50,31 +50,31 @@ def tf_workspace(path_prefix="", tf_repo_name=""): mkl_repository( name = "mkl_linux", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.12/mklml_lnx_2018.0.1.20171227.tgz", - "https://github.com/intel/mkl-dnn/releases/download/v0.12/mklml_lnx_2018.0.1.20171227.tgz", + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz", + "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_lnx_2018.0.2.20180127.tgz", ], - sha256 = "feacc3d82565c1231470359b42c696236fae873704e0b013436afba5fd4fd30f", - strip_prefix = "mklml_lnx_2018.0.1.20171227", + sha256 = "74844bd77294742bf2396ff040369d1aa4cdd9e826fcd38cf8398ae83564d146", + strip_prefix = "mklml_lnx_2018.0.2.20180127", build_file = clean_dep("//third_party/mkl:mkl.BUILD") ) mkl_repository( name = "mkl_windows", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.12/mklml_win_2018.0.1.20171227.zip", - "https://github.com/intel/mkl-dnn/releases/download/v0.12/mklml_win_2018.0.1.20171227.zip" + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip", + "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_win_2018.0.2.20180127.zip" ], - sha256 = 
"24bae8d7b22b431a654acadea43f2243c46ae6b1e5a73a4a936825f31d284ee4", - strip_prefix = "mklml_win_2018.0.1.20171227", + sha256 = "d8fbf0faa0684bffa3548005d05fe5cfe56ff9dbc0e15e7612d7ac01055a6ded", + strip_prefix = "mklml_win_2018.0.2.20180127", build_file = clean_dep("//third_party/mkl:mkl.BUILD") ) mkl_repository( name = "mkl_darwin", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.12/mklml_mac_2018.0.1.20171227.tgz", - "https://github.com/intel/mkl-dnn/releases/download/v0.12/mklml_mac_2018.0.1.20171227.tgz" + "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz", + "https://github.com/intel/mkl-dnn/releases/download/v0.13/mklml_mac_2018.0.2.20180127.tgz" ], - sha256 = "0e954ec6fd3dc5e37f64c4043f6b5613dd687558da3df1028b3b7c29ff5cf77f", - strip_prefix = "mklml_mac_2018.0.1.20171227", + sha256 = "aa740d71e14562bfea56e6829e6dc186e7487cbcf6748a88dec73826b7ec1943", + strip_prefix = "mklml_mac_2018.0.2.20180127", build_file = clean_dep("//third_party/mkl:mkl.BUILD") ) @@ -85,11 +85,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "mkl_dnn", urls = [ - "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.12.tar.gz", - "https://github.com/intel/mkl-dnn/archive/v0.12.tar.gz", + "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/v0.13.tar.gz", + "https://github.com/intel/mkl-dnn/archive/v0.13.tar.gz", ], - sha256 = "86fa2a8c12a56e3b725945acedeaa82492746be02545aba6d710f097e013e19e", - strip_prefix = "mkl-dnn-0.12", + sha256 = "d2cfd93a70cfe86ebe054477c530c9b5c1218b70f75856eb6d1956c68ee89e8f", + strip_prefix = "mkl-dnn-0.13", build_file = clean_dep("//third_party/mkl_dnn:mkldnn.BUILD"), ) From b23e91d247368f2046dae035b5c7bdda56512077 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 20 Apr 2018 12:37:39 -0700 Subject: [PATCH 0535/1734] Changed tf_to_tflite build rule. PiperOrigin-RevId: 193707628 --- tensorflow/contrib/lite/build_def.bzl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl index b8f6b7fd59a..85216776823 100644 --- a/tensorflow/contrib/lite/build_def.bzl +++ b/tensorflow/contrib/lite/build_def.bzl @@ -124,19 +124,19 @@ def tf_to_tflite(name, src, options, out): out: name of the output flatbuffer file. """ - toco = "//tensorflow/contrib/lite/toco:toco" + toco_cmdline = " ".join([ + "//tensorflow/contrib/lite/toco:toco", + "--input_format=TENSORFLOW_GRAPHDEF", + "--output_format=TFLITE", + ("--input_file=$(location %s)" % src), + ("--output_file=$(location %s)" % out), + ] + options ) native.genrule( name = name, - srcs=[src, options], + srcs=[src], outs=[out], - cmd = ("$(location %s) " + - " --input_file=$(location %s) " + - " --output_file=$(location %s) " + - " --input_format=TENSORFLOW_GRAPHDEF" + - " --output_format=TFLITE" + - " `cat $(location %s)`") - % (toco, src, out, options), - tools= [toco], + cmd = toco_cmdline, + tools= ["//tensorflow/contrib/lite/toco:toco"], ) def tflite_to_json(name, src, out): From 517d1912f4ec71180944320350a3694332a1dedc Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Fri, 20 Apr 2018 12:40:57 -0700 Subject: [PATCH 0536/1734] Add a utility to visualize object-based checkpoints Useful for generating a warm fuzzy feeling that everything you think should be saved was saved, and for explaining what object-based checkpointing is. 
(Also useful on the former front will be a planned "assert that all of this Graph's trainable variables are accessible from object X" function.) Somewhat hacky since it generates strings rather than using the pydot bindings (and so works without a pydot dependency). PiperOrigin-RevId: 193708003 --- tensorflow/contrib/BUILD | 1 + tensorflow/contrib/checkpoint/__init__.py | 3 + tensorflow/contrib/checkpoint/python/BUILD | 32 +++++ .../contrib/checkpoint/python/visualize.py | 111 ++++++++++++++++++ .../checkpoint/python/visualize_test.py | 97 +++++++++++++++ 5 files changed, 244 insertions(+) create mode 100644 tensorflow/contrib/checkpoint/python/visualize.py create mode 100644 tensorflow/contrib/checkpoint/python/visualize_test.py diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD index 7e475165500..d28392a62c2 100644 --- a/tensorflow/contrib/BUILD +++ b/tensorflow/contrib/BUILD @@ -25,6 +25,7 @@ py_library( "//tensorflow/contrib/batching:batch_py", "//tensorflow/contrib/bayesflow:bayesflow_py", "//tensorflow/contrib/boosted_trees:init_py", + "//tensorflow/contrib/checkpoint/python:checkpoint", "//tensorflow/contrib/cloud:cloud_py", "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip", "//tensorflow/contrib/cluster_resolver:cluster_resolver_py", diff --git a/tensorflow/contrib/checkpoint/__init__.py b/tensorflow/contrib/checkpoint/__init__.py index 70d7d2d8d79..1192cc44a17 100644 --- a/tensorflow/contrib/checkpoint/__init__.py +++ b/tensorflow/contrib/checkpoint/__init__.py @@ -16,6 +16,7 @@ For creating and managing dependencies: +@@dot_graph_from_checkpoint @@split_dependency """ @@ -24,6 +25,8 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib.checkpoint.python.split_dependency import split_dependency +from tensorflow.contrib.checkpoint.python.visualize import dot_graph_from_checkpoint + from tensorflow.python.util.all_util import remove_undocumented remove_undocumented(module_name=__name__) diff --git a/tensorflow/contrib/checkpoint/python/BUILD b/tensorflow/contrib/checkpoint/python/BUILD index d57b01aab26..a5681ffa61d 100644 --- a/tensorflow/contrib/checkpoint/python/BUILD +++ b/tensorflow/contrib/checkpoint/python/BUILD @@ -4,6 +4,15 @@ package(default_visibility = ["//tensorflow:internal"]) load("//tensorflow:tensorflow.bzl", "py_test") +py_library( + name = "checkpoint", + srcs_version = "PY2AND3", + deps = [ + ":split_dependency", + ":visualize", + ], +) + py_library( name = "split_dependency", srcs = ["split_dependency.py"], @@ -27,3 +36,26 @@ py_test( "//tensorflow/python/eager:test", ], ) + +py_library( + name = "visualize", + srcs = ["visualize.py"], + srcs_version = "PY2AND3", + visibility = ["//tensorflow:internal"], + deps = [ + "//tensorflow/python:pywrap_tensorflow", + ], +) + +py_test( + name = "visualize_test", + srcs = ["visualize_test.py"], + deps = [ + ":visualize", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:resource_variable_ops", + "//tensorflow/python:training", + "//tensorflow/python/eager:test", + ], +) diff --git a/tensorflow/contrib/checkpoint/python/visualize.py b/tensorflow/contrib/checkpoint/python/visualize.py new file mode 100644 index 00000000000..86fbdb41d2c --- /dev/null +++ b/tensorflow/contrib/checkpoint/python/visualize.py @@ -0,0 +1,111 @@ +"""Utilities for visualizing dependency graphs.""" +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.core.protobuf import checkpointable_object_graph_pb2 +from tensorflow.python import pywrap_tensorflow +from tensorflow.python.framework import errors_impl +from tensorflow.python.training import checkpointable + + +def dot_graph_from_checkpoint(save_path): + r"""Visualizes an object-based checkpoint (from `tf.train.Checkpoint`). + + Useful for inspecting checkpoints and debugging loading issues. + + Example usage from Python (requires pydot): + ```python + import tensorflow as tf + import pydot + + dot_string = tf.contrib.checkpoint.dot_graph_from_checkpoint('/path/to/ckpt') + parsed, = pydot.graph_from_dot_data(dot_string) + parsed.write_svg('/tmp/tensorflow/visualized_checkpoint.svg') + ``` + + Example command line usage: + ```sh + python -c "import tensorflow as tf;\ + print(tf.contrib.checkpoint.dot_graph_from_checkpoint('/path/to/ckpt'))"\ + | dot -Tsvg > /tmp/tensorflow/checkpoint_viz.svg + ``` + + Args: + save_path: The checkpoint prefix, as returned by `tf.train.Checkpoint.save` + or `tf.train.latest_checkpoint`. + Returns: + A graph in DOT format as a string. + """ + reader = pywrap_tensorflow.NewCheckpointReader(save_path) + try: + object_graph_string = reader.get_tensor( + checkpointable.OBJECT_GRAPH_PROTO_KEY) + except errors_impl.NotFoundError: + raise ValueError( + ('The specified checkpoint "%s" does not appear to be object-based (it ' + 'is missing the key "%s"). 
Likely it was created with a name-based ' + 'saver and does not contain an object dependency graph.') % ( + save_path, checkpointable.OBJECT_GRAPH_PROTO_KEY)) + shape_map = reader.get_variable_to_shape_map() + dtype_map = reader.get_variable_to_dtype_map() + object_graph = ( + checkpointable_object_graph_pb2.CheckpointableObjectGraph()) + object_graph.ParseFromString(object_graph_string) + graph = 'digraph {\n' + def _escape(name): + return name.replace('"', '\\"') + slot_ids = set() + for node in object_graph.nodes: + for slot_reference in node.slot_variables: + slot_ids.add(slot_reference.slot_variable_node_id) + for node_id, node in enumerate(object_graph.nodes): + if (len(node.attributes) == 1 + and node.attributes[0].name == checkpointable.VARIABLE_VALUE_KEY): + if node_id in slot_ids: + color = 'orange' + tooltip_prefix = 'Slot variable' + else: + color = 'blue' + tooltip_prefix = 'Variable' + attribute = node.attributes[0] + graph += ('N_%d [shape=point label="" color=%s width=.25' + ' tooltip="%s %s shape=%s %s"]\n') % ( + node_id, + color, + tooltip_prefix, + _escape(attribute.full_name), + shape_map[attribute.checkpoint_key], + dtype_map[attribute.checkpoint_key].name) + elif node.slot_variables: + graph += ('N_%d [shape=point label="" width=.25 color=red,' + 'tooltip="Optimizer"]\n') % node_id + else: + graph += 'N_%d [shape=point label="" width=.25]\n' % node_id + for reference in node.children: + graph += 'N_%d -> N_%d [label="%s"]\n' % ( + node_id, reference.node_id, _escape(reference.local_name)) + for slot_reference in node.slot_variables: + graph += 'N_%d -> N_%d [label="%s" style=dotted]\n' % ( + node_id, + slot_reference.slot_variable_node_id, + _escape(slot_reference.slot_name)) + graph += 'N_%d -> N_%d [style=dotted]\n' % ( + slot_reference.original_variable_node_id, + slot_reference.slot_variable_node_id) + graph += '}\n' + return graph diff --git a/tensorflow/contrib/checkpoint/python/visualize_test.py b/tensorflow/contrib/checkpoint/python/visualize_test.py new file mode 100644 index 00000000000..1d9ab789235 --- /dev/null +++ b/tensorflow/contrib/checkpoint/python/visualize_test.py @@ -0,0 +1,97 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import os + +from tensorflow.contrib.checkpoint.python import visualize + +from tensorflow.python.eager import context +from tensorflow.python.eager import test +from tensorflow.python.framework import constant_op +from tensorflow.python.keras._impl.keras.engine import training +from tensorflow.python.keras._impl.keras.layers import core +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.training import adam +from tensorflow.python.training import checkpointable_utils + +try: + import pydot # pylint: disable=g-import-not-at-top +except ImportError: + pydot = None + + +class MyModel(training.Model): + """A concrete Model for testing.""" + + def __init__(self): + super(MyModel, self).__init__() + self._named_dense = core.Dense(1, use_bias=True) + self._second = core.Dense(1, use_bias=False) + + def call(self, values): + ret = self._second(self._named_dense(values)) + return ret + + +class DotGraphTests(test.TestCase): + + def testMakeDotGraph(self): + with context.eager_mode(): + input_value = constant_op.constant([[3.]]) + model = MyModel() + optimizer = adam.AdamOptimizer(0.001) + optimizer_step = resource_variable_ops.ResourceVariable(12) + save_checkpoint = checkpointable_utils.Checkpoint( + optimizer=optimizer, model=model, optimizer_step=optimizer_step) + optimizer.minimize(functools.partial(model, input_value)) + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt') + save_path = save_checkpoint.save(checkpoint_prefix) + prefix = save_checkpoint.save(save_path) + + dot_graph_string = visualize.dot_graph_from_checkpoint(prefix) + + # The remainder of this test is more-or-less optional since it's so + # dependent on pydot/platform/Python versions. + if pydot is None: + self.skipTest('pydot is required for the remainder of this test.') + try: + parsed, = pydot.graph_from_dot_data(dot_graph_string) + except NameError as e: + if "name 'dot_parser' is not defined" in str(e): + self.skipTest("pydot isn't working") + else: + raise + # Check that the graph isn't completely trivial + self.assertEqual( + '"model"', + parsed.obj_dict['edges'][('N_0', 'N_1')][0]['attributes']['label']) + image_path = os.path.join(self.get_temp_dir(), 'saved.svg') + try: + parsed.write_svg(image_path) + except Exception as e: # pylint: disable=broad-except + # For some reason PyDot's "dot not available" error is an Exception, not + # something more specific. + if '"dot" not found in path' in str(e): + self.skipTest("pydot won't save SVGs (dot not available)") + else: + raise + +if __name__ == '__main__': + test.main() From fc6510b506731bf2ffc2520e30fba73b79e5b687 Mon Sep 17 00:00:00 2001 From: Chris Ying Date: Tue, 17 Apr 2018 15:28:12 -0700 Subject: [PATCH 0537/1734] Fix CheckpointSaverHook to properly save every save_checkpoints_steps for TPU workloads. 
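The new `steps_per_run` argument tells the hook how many global steps each `Session.run` advances, so the step timer can fire on the value the train op will actually reach. A rough sketch of the intended configuration (the numbers and directory are illustrative assumptions, not from this patch):

```python
# With iterations_per_loop=50 on TPU, every run() advances the global step by
# 50, so the hook must test stale_global_step + 50 rather than + 1.
import tensorflow as tf

hook = tf.train.CheckpointSaverHook(
    checkpoint_dir='/tmp/model',  # illustrative path
    save_steps=100,               # aim for a checkpoint every ~100 global steps
    steps_per_run=50)             # global steps advanced by each Session.run
# The hook saves on the first run and then on runs whose step crosses each
# save_steps boundary (plus a final save in end()), as the tests below verify.
```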
PiperOrigin-RevId: 193266515 (cherry picked from commit 5aba07dce5b9e924183efcd05cd82f2fbb70edc8) --- .../contrib/tpu/python/tpu/tpu_estimator.py | 9 ++ .../training/basic_session_run_hooks.py | 10 +- .../training/basic_session_run_hooks_test.py | 93 +++++++++++++++++++ ...sorflow.train.-checkpoint-saver-hook.pbtxt | 2 +- 4 files changed, 111 insertions(+), 3 deletions(-) diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py index 1332108d04c..c8c4cc6c685 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py @@ -2054,6 +2054,14 @@ class TPUEstimator(estimator_lib.Estimator): }, every_n_secs=30) ] + input_hooks + chief_hooks = [ + training.CheckpointSaverHook( + self.model_dir, + save_secs=self._config.save_checkpoints_secs, + save_steps=self._config.save_checkpoints_steps, + steps_per_run=self._config.tpu_config.iterations_per_loop, + scaffold=scaffold) + ] summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss) with ops.control_dependencies([loss]): update_ops = _sync_variables_ops() @@ -2067,6 +2075,7 @@ class TPUEstimator(estimator_lib.Estimator): return model_fn_lib.EstimatorSpec( mode, loss=loss, + training_chief_hooks=chief_hooks, training_hooks=hooks, train_op=train_op, scaffold=scaffold) diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py index 094a9e886ba..3651291bdfc 100644 --- a/tensorflow/python/training/basic_session_run_hooks.py +++ b/tensorflow/python/training/basic_session_run_hooks.py @@ -391,7 +391,8 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook): saver=None, checkpoint_basename="model.ckpt", scaffold=None, - listeners=None): + listeners=None, + steps_per_run=1): """Initializes a `CheckpointSaverHook`. Args: @@ -404,6 +405,9 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook): listeners: List of `CheckpointSaverListener` subclass instances. Used for callbacks that run immediately before or after this hook saves the checkpoint. + steps_per_run: `int`, number of steps that occur between each invocation + of the hook. Primarily used for TPU workloads which run multiple steps + in a while loop in a single Session.run. Raises: ValueError: One of `save_steps` or `save_secs` should be set. @@ -419,6 +423,7 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook): self._timer = SecondOrStepTimer(every_secs=save_secs, every_steps=save_steps) self._listeners = listeners or [] + self._steps_per_run = steps_per_run def begin(self): self._summary_writer = SummaryWriterCache.get(self._checkpoint_dir) @@ -450,7 +455,8 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook): def after_run(self, run_context, run_values): stale_global_step = run_values.results - if self._timer.should_trigger_for_step(stale_global_step+1): + if self._timer.should_trigger_for_step( + stale_global_step + self._steps_per_run): # get the real value after train op. 
global_step = run_context.session.run(self._global_step_tensor) if self._timer.should_trigger_for_step(global_step): diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py index f39a5261a93..25962f6bf7a 100644 --- a/tensorflow/python/training/basic_session_run_hooks_test.py +++ b/tensorflow/python/training/basic_session_run_hooks_test.py @@ -719,6 +719,99 @@ class CheckpointSaverHookTest(test.TestCase): fake_summary_writer.FakeSummaryWriter.uninstall() +class CheckpointSaverHookMultiStepTest(test.TestCase): + + def setUp(self): + self.model_dir = tempfile.mkdtemp() + self.graph = ops.Graph() + self.steps_per_run = 5 + with self.graph.as_default(): + self.scaffold = monitored_session.Scaffold() + self.global_step = variables.get_or_create_global_step() + self.train_op = training_util._increment_global_step(self.steps_per_run) + + def tearDown(self): + shutil.rmtree(self.model_dir, ignore_errors=True) + + def test_save_steps_saves_in_first_step(self): + with self.graph.as_default(): + hook = basic_session_run_hooks.CheckpointSaverHook( + self.model_dir, + save_steps=2*self.steps_per_run, + scaffold=self.scaffold, + steps_per_run=self.steps_per_run) + hook.begin() + self.scaffold.finalize() + with session_lib.Session() as sess: + sess.run(self.scaffold.init_op) + mon_sess = monitored_session._HookedSession(sess, [hook]) + mon_sess.run(self.train_op) + self.assertEqual(5, + checkpoint_utils.load_variable(self.model_dir, + self.global_step.name)) + + def test_save_steps_saves_periodically(self): + with self.graph.as_default(): + hook = basic_session_run_hooks.CheckpointSaverHook( + self.model_dir, + save_steps=2*self.steps_per_run, + scaffold=self.scaffold, + steps_per_run=self.steps_per_run) + hook.begin() + self.scaffold.finalize() + with session_lib.Session() as sess: + sess.run(self.scaffold.init_op) + mon_sess = monitored_session._HookedSession(sess, [hook]) + mon_sess.run(self.train_op) + # Saved (step=5) + self.assertEqual(5, + checkpoint_utils.load_variable(self.model_dir, + self.global_step.name)) + + mon_sess.run(self.train_op) + # Not saved (step=10) + self.assertEqual(5, + checkpoint_utils.load_variable(self.model_dir, + self.global_step.name)) + + mon_sess.run(self.train_op) + # Saved (step=15) + self.assertEqual(15, + checkpoint_utils.load_variable(self.model_dir, + self.global_step.name)) + + mon_sess.run(self.train_op) + # Not saved (step=20) + self.assertEqual(15, + checkpoint_utils.load_variable(self.model_dir, + self.global_step.name)) + + mon_sess.run(self.train_op) + # Saved (step=25) + self.assertEqual(25, + checkpoint_utils.load_variable(self.model_dir, + self.global_step.name)) + + def test_save_steps_saves_at_end(self): + with self.graph.as_default(): + hook = basic_session_run_hooks.CheckpointSaverHook( + self.model_dir, + save_steps=2*self.steps_per_run, + scaffold=self.scaffold, + steps_per_run=self.steps_per_run) + hook.begin() + self.scaffold.finalize() + with session_lib.Session() as sess: + sess.run(self.scaffold.init_op) + mon_sess = monitored_session._HookedSession(sess, [hook]) + mon_sess.run(self.train_op) + mon_sess.run(self.train_op) + hook.end(sess) + self.assertEqual(10, + checkpoint_utils.load_variable(self.model_dir, + self.global_step.name)) + + class ResourceCheckpointSaverHookTest(test.TestCase): def setUp(self): diff --git a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt 
index c3037baa8c9..327799729c9 100644 --- a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt @@ -5,7 +5,7 @@ tf_class { is_instance: "" member_method { name: "__init__" - argspec: "args=[\'self\', \'checkpoint_dir\', \'save_secs\', \'save_steps\', \'saver\', \'checkpoint_basename\', \'scaffold\', \'listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'model.ckpt\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'checkpoint_dir\', \'save_secs\', \'save_steps\', \'saver\', \'checkpoint_basename\', \'scaffold\', \'listeners\', \'steps_per_run\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'model.ckpt\', \'None\', \'None\', \'1\'], " } member_method { name: "after_create_session" From e1cc34d34b3a811da7c7a2d7cc6c60398c50fdfb Mon Sep 17 00:00:00 2001 From: Chris Ying Date: Tue, 17 Apr 2018 20:31:30 -0700 Subject: [PATCH 0538/1734] Disable CheckpointSaverHook when both save_checkpoints_secs and save_checkpoints_steps are None PiperOrigin-RevId: 193299688 (cherry picked from commit 41e2cd187b31e9e6d88bc042e21e73f7be0ed729) --- .../contrib/tpu/python/tpu/tpu_estimator.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py index c8c4cc6c685..8df631b475e 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py @@ -2054,14 +2054,16 @@ class TPUEstimator(estimator_lib.Estimator): }, every_n_secs=30) ] + input_hooks - chief_hooks = [ - training.CheckpointSaverHook( - self.model_dir, - save_secs=self._config.save_checkpoints_secs, - save_steps=self._config.save_checkpoints_steps, - steps_per_run=self._config.tpu_config.iterations_per_loop, - scaffold=scaffold) - ] + chief_hooks = [] + if (self._config.save_checkpoints_secs or + self._config.save_checkpoints_steps): + chief_hooks.append( + training.CheckpointSaverHook( + self.model_dir, + save_secs=self._config.save_checkpoints_secs, + save_steps=self._config.save_checkpoints_steps, + steps_per_run=self._config.tpu_config.iterations_per_loop, + scaffold=scaffold)) summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss) with ops.control_dependencies([loss]): update_ops = _sync_variables_ops() From 0b6ca72332735fe460da23fbcca5c8c24d838f28 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 20 Apr 2018 13:18:02 -0700 Subject: [PATCH 0539/1734] Update ops-related pbtxt files. 
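The regenerated pbtxt adds a `dilations` attr (defaulting to five 1s, i.e. ordinary convolution) to `Conv3DBackpropFilter` and `Conv3DBackpropInput`. For intuition, a user-level sketch of what a non-default dilation means (the shapes and the use of `tf.nn.convolution` are illustrative, not part of this change):

```python
# A dilation of 2 in each spatial dimension samples every other input element,
# enlarging the receptive field without adding filter weights.
import tensorflow as tf

x = tf.random_normal([1, 8, 8, 8, 4])   # NDHWC input
w = tf.random_normal([2, 2, 2, 4, 8])   # DHWIO filter
y = tf.nn.convolution(x, w, padding='SAME', dilation_rate=(2, 2, 2))
```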
PiperOrigin-RevId: 193712839 --- .../core/ops/compat/ops_history.v1.pbtxt | 124 ++++++++++++++++++ tensorflow/core/ops/ops.pbtxt | 26 ++++ 2 files changed, 150 insertions(+) diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt index dbd6f859c46..247f9edf5b2 100644 --- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt @@ -13445,6 +13445,68 @@ op { version: 10 } } +op { + name: "Conv3DBackpropFilter" + input_arg { + name: "input" + type_attr: "T" + } + input_arg { + name: "filter" + type_attr: "T" + } + input_arg { + name: "out_backprop" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + } + } + } + attr { + name: "strides" + type: "list(int)" + has_minimum: true + minimum: 5 + } + attr { + name: "padding" + type: "string" + allowed_values { + list { + s: "SAME" + s: "VALID" + } + } + } + attr { + name: "dilations" + type: "list(int)" + default_value { + list { + i: 1 + i: 1 + i: 1 + i: 1 + i: 1 + } + } + } + deprecation { + version: 10 + } +} op { name: "Conv3DBackpropFilterV2" input_arg { @@ -13718,6 +13780,68 @@ op { version: 10 } } +op { + name: "Conv3DBackpropInput" + input_arg { + name: "input" + type_attr: "T" + } + input_arg { + name: "filter" + type_attr: "T" + } + input_arg { + name: "out_backprop" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + } + } + } + attr { + name: "strides" + type: "list(int)" + has_minimum: true + minimum: 5 + } + attr { + name: "padding" + type: "string" + allowed_values { + list { + s: "SAME" + s: "VALID" + } + } + } + attr { + name: "dilations" + type: "list(int)" + default_value { + list { + i: 1 + i: 1 + i: 1 + i: 1 + i: 1 + } + } + } + deprecation { + version: 10 + } +} op { name: "Conv3DBackpropInputV2" input_arg { diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 46afe357f06..d1773daebe4 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -5651,6 +5651,19 @@ op { } } } + attr { + name: "dilations" + type: "list(int)" + default_value { + list { + i: 1 + i: 1 + i: 1 + i: 1 + i: 1 + } + } + } deprecation { version: 10 explanation: "Use Conv3DBackpropFilterV2" @@ -5774,6 +5787,19 @@ op { } } } + attr { + name: "dilations" + type: "list(int)" + default_value { + list { + i: 1 + i: 1 + i: 1 + i: 1 + i: 1 + } + } + } deprecation { version: 10 explanation: "Use Conv3DBackpropInputV2" From 02075fa2456d951ff3b7bdb8fee76a1b9c6d8716 Mon Sep 17 00:00:00 2001 From: Guozhong Zhuang Date: Fri, 20 Apr 2018 13:43:06 -0700 Subject: [PATCH 0540/1734] MKLDNN: conv2d forward DNN primitive reuse enhancement (#17943) * Enable conv2d fwd primitive reuse * coding style change based on suggestions from TF team * minor code style fix * refactor conv2d primitive reuse class and enhance key creation utility * refactor by introducing ConvFwdDimensions structure * change 'Execute' method to be a template one per PR review suggestion * Per PR review suggestion, update DnnOp class to declared related method as abstract ones * refactor AddAsKey method - template for scalar value and remove Execute()which is not used yet * rename padding_l/_r/pl/pr to padding_left or padding_right as recommended * parameter and variable renaming - to 
make them more explicit
---
 tensorflow/core/kernels/mkl_conv_ops.cc | 414 +++++++++++++++++-------
 tensorflow/core/util/mkl_util.h         |  87 ++++-
 2 files changed, 389 insertions(+), 112 deletions(-)

diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index f0818eb96da..f2b14f12789 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <string.h>
 #include <map>
 #include <vector>
+#include <string>

 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -42,14 +43,13 @@ limitations under the License.
 #include "tensorflow/core/util/mkl_util.h"

 #ifndef INTEL_MKL_ML
-
 #include "mkldnn.hpp"

 using mkldnn::prop_kind;
 using mkldnn::stream;
-
-using mkldnn::convolution_direct;
 using mkldnn::convolution_forward;
+using mkldnn::convolution_direct;
+
 #else
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
@@ -57,11 +57,232 @@ using mkldnn::convolution_forward;

 namespace tensorflow {

+#ifndef INTEL_MKL_ML
+
+struct ConvFwdDimensions {
+  memory::dims src_dims;
+  memory::dims filter_dims;
+  memory::dims bias_dims;
+  memory::dims dst_dims;
+  memory::dims strides;
+  memory::dims dilations;
+  memory::dims padding_left;
+  memory::dims padding_right;
+
+  ConvFwdDimensions(memory::dims src_dims,
+      memory::dims filter_dims, memory::dims bias_dims,
+      memory::dims dst_dims, memory::dims strides,
+      memory::dims dilations, memory::dims padding_left,
+      memory::dims padding_right) :
+      src_dims(src_dims), filter_dims(filter_dims),
+      bias_dims(bias_dims), dst_dims(dst_dims),
+      strides(strides), dilations(dilations),
+      padding_left(padding_left), padding_right(padding_right) {
+  }
+};
+
+template <typename T>
+class Conv2DFwd : public DnnOp {
+ public:
+  explicit Conv2DFwd(const ConvFwdDimensions& convFwdDims) {
+    fwd_stream_.reset(new stream(stream::kind::eager));
+    // create conv primitive
+    if (conv_fwd_ == nullptr) {
+      Setup(convFwdDims);
+    }
+  }
+
+  ~Conv2DFwd() {}
+
+  // Convolution forward execute with bias
+  //   src_data:    input data buffer of src
+  //   filter_data: input data buffer of filter (weights)
+  //   bias_data:   input data buffer of bias
+  //   dst_data:    output data buffer of dst
+  void Execute(T* src_data, T* filter_data, T* bias_data, T* dst_data) {
+    src_mem_->set_data_handle(static_cast<void*>(src_data));
+    filter_mem_->set_data_handle(static_cast<void*>(filter_data));
+    bias_mem_->set_data_handle(static_cast<void*>(bias_data));
+    dst_mem_->set_data_handle(static_cast<void*>(dst_data));
+    fwd_stream_->submit(fwd_primitives_);
+
+    // after exec, set data handle back
+    src_mem_->set_data_handle(DummyData);
+    filter_mem_->set_data_handle(DummyData);
+    bias_mem_->set_data_handle(DummyData);
+    dst_mem_->set_data_handle(DummyData);
+
+    return;
+  }
+
+  // Convolution forward execute without bias
+  //   src_data:    input data buffer of src
+  //   filter_data: input data buffer of filter (weights)
+  //   dst_data:    output data buffer of dst
+  void Execute(T* src_data, T* filter_data, T* dst_data) {
+    src_mem_->set_data_handle(static_cast<void*>(src_data));
+    filter_mem_->set_data_handle(static_cast<void*>(filter_data));
+    dst_mem_->set_data_handle(static_cast<void*>(dst_data));
+    fwd_stream_->submit(fwd_primitives_);
+
+    // after exec, set data handle back
+    src_mem_->set_data_handle(DummyData);
+    filter_mem_->set_data_handle(DummyData);
+    dst_mem_->set_data_handle(DummyData);
+
+    return;
+  }
+
+  // expected memory format for this primitive instance
+  memory::format src_fmt_;
+  memory::format filter_fmt_;
+
+  // convolution primitive
+  std::shared_ptr<mkldnn::convolution_forward::primitive_desc> fwd_pd_;
+  std::shared_ptr<mkldnn::primitive> conv_fwd_;
+
+ private:
+  void Setup(const ConvFwdDimensions& convFwdDims) {
+    // create memory descriptors for convolution data w/ no specified format
+    src_md_.reset(new memory::desc({convFwdDims.src_dims},
+        MklDnnType<T>(), memory::format::any));
+
+    filter_md_.reset(new memory::desc({convFwdDims.filter_dims},
+        MklDnnType<T>(), memory::format::any));
+
+    dst_md_.reset(new memory::desc({convFwdDims.dst_dims},
+        MklDnnType<T>(), memory::format::any));
+
+    if (!convFwdDims.bias_dims.empty())
+      bias_md_.reset(new memory::desc({convFwdDims.bias_dims},
+          MklDnnType<T>(), memory::format::any));
+
+    // create a convolution
+    if (!convFwdDims.bias_dims.empty()) {
+      fwd_desc_.reset(new convolution_forward::desc(prop_kind::forward,
+          convolution_direct, *src_md_, *filter_md_, *bias_md_, *dst_md_,
+          convFwdDims.strides, convFwdDims.dilations, convFwdDims.padding_left,
+          convFwdDims.padding_right, padding_kind::zero));
+    } else {
+      fwd_desc_.reset(new convolution_forward::desc(prop_kind::forward,
+          convolution_direct, *src_md_, *filter_md_, *dst_md_,
+          convFwdDims.strides, convFwdDims.dilations, convFwdDims.padding_left,
+          convFwdDims.padding_right, padding_kind::zero));
+    }
+
+    fwd_pd_.reset(new convolution_forward::primitive_desc(
+        *fwd_desc_, cpu_engine_));
+
+    // store the expected memory format
+    src_fmt_ = static_cast<mkldnn::memory::format>(
+        fwd_pd_.get()->src_primitive_desc().desc().data.format);
+
+    filter_fmt_ = static_cast<mkldnn::memory::format>(
+        fwd_pd_.get()->weights_primitive_desc().desc().data.format);
+
+    // create memory primitive based on dummy data
+    src_mem_.reset(new memory(fwd_pd_.get()->src_primitive_desc(), DummyData));
+    filter_mem_.reset(new memory(fwd_pd_.get()->weights_primitive_desc(),
+        DummyData));
+    dst_mem_.reset(new memory(fwd_pd_.get()->dst_primitive_desc(), DummyData));
+
+    // create convolution primitive and add it to net
+    if (!convFwdDims.bias_dims.empty()) {
+      bias_mem_.reset(new memory({{{convFwdDims.bias_dims}, MklDnnType<T>(),
+          memory::format::x}, cpu_engine_}, DummyData));
+      conv_fwd_.reset(new convolution_forward(*fwd_pd_, *src_mem_,
+          *filter_mem_, *bias_mem_, *dst_mem_));
+    } else {
+      conv_fwd_.reset(new convolution_forward(*fwd_pd_, *src_mem_,
+          *filter_mem_, *dst_mem_));
+    }
+
+    fwd_primitives_.push_back(*conv_fwd_);
+    return;
+  }
+
+  // MKLDNN memory
+  std::shared_ptr<mkldnn::memory> src_mem_;
+  std::shared_ptr<mkldnn::memory> filter_mem_;
+  std::shared_ptr<mkldnn::memory> bias_mem_;
+  std::shared_ptr<mkldnn::memory> dst_mem_;
+
+  std::shared_ptr<mkldnn::stream> fwd_stream_;
+  std::vector<mkldnn::primitive> fwd_primitives_;
+
+  // desc & primitive desc
+  std::shared_ptr<mkldnn::convolution_forward::desc> fwd_desc_;
+
+  // memory desc
+  std::shared_ptr<mkldnn::memory::desc> src_md_;
+  std::shared_ptr<mkldnn::memory::desc> filter_md_;
+  std::shared_ptr<mkldnn::memory::desc> bias_md_;
+  std::shared_ptr<mkldnn::memory::desc> dst_md_;
+
+  engine cpu_engine_ = engine(engine::cpu, 0);
+};
+
+template <typename T>
+class Conv2DFwdFactory : public DnnOpFactory<T> {
+ public:
+  static Conv2DFwd<T>* Get(const ConvFwdDimensions& convFwdDims) {
+    Conv2DFwd<T>* conv2d_fwd = nullptr;
+
+    // try to find a suitable one in pool
+    conv2d_fwd = dynamic_cast<Conv2DFwd<T>*> (
+        Conv2DFwdFactory<T>::GetInstance().GetConv2DFwd(convFwdDims));
+
+    if (conv2d_fwd == nullptr) {
+      conv2d_fwd = new Conv2DFwd<T>(convFwdDims);
+      Conv2DFwdFactory<T>::GetInstance().SetConv2DFwd(
+          convFwdDims, conv2d_fwd);
+    }
+    return conv2d_fwd;
+  }
+
+ private:
+  Conv2DFwdFactory() {}
+  ~Conv2DFwdFactory() {}
+
+  static const int kDilationH = 0, kDilationW = 1;
+
+  static Conv2DFwdFactory& GetInstance() {
+    static Conv2DFwdFactory instance_;
+    return instance_;
+  }
+
+  static std::string CreateKey(const ConvFwdDimensions& convFwdDims) {
+    std::string prefix = "conv2d_fwd_";
+    FactoryKeyCreator key_creator;
+    key_creator.AddAsKey(prefix);
+    key_creator.AddAsKey(convFwdDims.src_dims);
+    key_creator.AddAsKey(convFwdDims.filter_dims);
+    key_creator.AddAsKey(convFwdDims.bias_dims);
+    key_creator.AddAsKey(convFwdDims.dst_dims);
+    key_creator.AddAsKey(convFwdDims.strides);
+    key_creator.AddAsKey(convFwdDims.dilations);
+    key_creator.AddAsKey(convFwdDims.padding_left);
+    key_creator.AddAsKey(convFwdDims.padding_right);
+    return key_creator.GetKey();
+  }
+
+  DnnOp* GetConv2DFwd(const ConvFwdDimensions& convFwdDims) {
+    std::string key = CreateKey(convFwdDims);
+    return this->GetOp(key);
+  }
+
+  void SetConv2DFwd(const ConvFwdDimensions& convFwdDims, DnnOp *op) {
+    std::string key = CreateKey(convFwdDims);
+    this->SetOp(key, op);
+  }
+};
+
+#endif
+
 typedef Eigen::ThreadPoolDevice CPUDevice;

-// MKL-DNN is now default. MKL-ML must be specified explicitly.
+// For now, MKL-ML is default. So making MKL-DNN not a default choice.
 #ifdef INTEL_MKL_ML
-
 template <typename Device, typename T, bool biasEnabled>
 class MklConv2DOp : public OpKernel {
  public:
@@ -528,8 +749,6 @@ class MklConv2DOp : public OpKernel {

   void Compute(OpKernelContext* context) override {
     try {
-      auto cpu_engine = engine(engine::cpu, 0);
-
       // Input tensors
       const Tensor& src_tensor = MklGetInput(context, kInputIndex_Src);
       const Tensor& filter_tensor = MklGetInput(context, kInputIndex_Filter);
@@ -538,16 +757,16 @@ class MklConv2DOp : public OpKernel {
       GetMklShape(context, kInputIndex_Src, &src_mkl_shape);
       GetMklShape(context, kInputIndex_Filter, &filter_mkl_shape);
       OP_REQUIRES(context, filter_mkl_shape.IsMklTensor() == false,
-                  errors::InvalidArgument("Filter should not be in "
-                                          "Mkl Layout"));
+          errors::InvalidArgument("Filter should not be in "
+          "Mkl Layout"));

       MklDnnData<T> src(&cpu_engine);
       MklDnnData<T> filter(&cpu_engine);
-      MklDnnData<T> output(&cpu_engine);
+      MklDnnData<T> dst(&cpu_engine);  // output

-      memory::dims src_dims, filter_dims, padding_l, padding_r,
+      memory::dims src_dims, filter_dims, padding_left, padding_right,
           dilations, strides;
-      memory::dims output_dims_tf_order, output_dims_mkl_order;
+      memory::dims dst_dims_tf_order, dst_dims_mkl_order;

       // Get shapes of input tensors in MKL-DNN order
       MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_,
                               dilations_);
       auto src_tf_shape = GetTfShape(context, kInputIndex_Src);
       auto filter_tf_shape = GetTfShape(context, kInputIndex_Filter);
       conv_utl.GetConvFwdSizesInMklOrder(
-          src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides,
-          &dilations, &output_dims_tf_order, &output_dims_mkl_order,
-          &padding_l, &padding_r);
+          src_tf_shape, filter_tf_shape, &src_dims, &filter_dims,
+          &strides, &dilations, &dst_dims_tf_order, &dst_dims_mkl_order,
+          &padding_left, &padding_right);
       if (!context->status().ok()) return;

       // Check for corner case - if there is nothing to compute, return.
-      TensorShape output_tf_shape = MklDnnDimsToTFShape(output_dims_tf_order);
+      TensorShape dst_tf_shape = MklDnnDimsToTFShape(dst_dims_tf_order);

       // Corner cases: output with 0 elements and 0 batch size.
-      Tensor* output_tensor = nullptr;
-      if (output_tf_shape.num_elements() == 0 || output_dims_tf_order[0] == 0) {
-        // TODO(jbobba): Verify correctness here
-        // Need semantics for Null MKL tensor
-        MklDnnShape output_mkl_shape;
-        output_mkl_shape.SetMklTensor(false);
-
-        AllocateOutputSetMklShape(context, kOutputIndex_Dst, &output_tensor,
-                                  src_tf_shape, output_mkl_shape);
+      Tensor* dst_tensor = nullptr;
+      if (dst_tf_shape.num_elements() == 0 ||
+          dst_dims_tf_order[0] == 0) {
+        MklDnnShape dst_mkl_shape;
+        dst_mkl_shape.SetMklTensor(false);
+        AllocateOutputSetMklShape(context, kOutputIndex_Dst,
+            &dst_tensor, src_tf_shape, dst_mkl_shape);

         // MklConv2D also outputs converted filter as 2nd output of Conv2D.
         filter_mkl_shape.SetMklTensor(false);
         Tensor* output_filter_tensor = nullptr;
         AllocateOutputSetMklShape(context, kOutputIndex_Filter,
-                                  &output_filter_tensor, filter_tf_shape,
-                                  filter_mkl_shape);
+            &output_filter_tensor,
+            filter_tf_shape, filter_mkl_shape);
         return;
       }

@@ -587,6 +804,7 @@ class MklConv2DOp : public OpKernel {
       // Describe how the inputs and outputs of Convolution look like. Also
       // specify buffers containing actual input and output data.
       auto tf_fmt = TFDataFormatToMklDnnDataFormat(data_format_);
+
       // If input is in MKL layout, then simply grab input layout; otherwise,
       // construct input Tf layout. For TF layout, although input shape
       // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's
@@ -595,6 +813,7 @@ class MklConv2DOp : public OpKernel {
           ? src_mkl_shape.GetMklLayout()
           : memory::desc(src_dims, MklDnnType<T>(), tf_fmt);
       src.SetUsrMem(src_md, &src_tensor);
+
       // Although filter shape (filter_dims) required is in MKL-DNN order,
       // the layout is Tensorflow's layout (HWIO).
       auto filter_md = filter_mkl_shape.IsMklTensor()  // Should NEVER be true
@@ -603,98 +822,70 @@ class MklConv2DOp : public OpKernel {
           memory::format::hwio);
       filter.SetUsrMem(filter_md, &filter_tensor);

-      // Set output shape (output_dims) required in MKL-DNN order.
-      // Currently, we set output layout as Tensorflow's layout (NHWC or NCHW
-      // depending on data format). But later we propagate Mkl layout of the
-      // output to the next op directly.
-      output.SetUsrMem(output_dims_mkl_order, tf_fmt);
-
-      // Create memory descriptors for convolution data w/ no specified format.
-      src.SetOpMemDesc(src_dims, memory::format::any);
-      filter.SetOpMemDesc(filter_dims, memory::format::any);
-      output.SetOpMemDesc(output_dims_mkl_order, memory::format::any);
-
       // MKLDNN dilation starts from 0.
       dilations[kDilationH] -= 1;
       dilations[kDilationW] -= 1;

+      // get a conv2d fwd from primitive pool
+      Conv2DFwd<T> *conv2d_fwd = nullptr;
       if (biasEnabled) {
-        // Create convolution primitive with Bias.
-        MklDnnData<T> bias(&cpu_engine);
-        memory::dims bias_size;
-        conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_size);
-        const Tensor& bias_tensor = MklGetInput(context, kInputIndex_Bias);
-        bias.SetUsrMem(bias_size, memory::format::x, &bias_tensor);
-        bias.SetOpMemDesc(bias_size, memory::format::any);
-
-        // Create convolution primitive with Bias.
-        // Use MKLDNN dilated convolution in case of dilated rate (>0).
-        auto conv_desc = (dilations[kDilationH] > 0 ||
-                          dilations[kDilationW] > 0) ?
-            convolution_forward::desc(prop_kind::forward,
-                convolution_direct, src.GetOpMemDesc(),
-                filter.GetOpMemDesc(), bias.GetOpMemDesc(),
-                output.GetOpMemDesc(), strides, dilations,
-                padding_l, padding_r,
-                TFPaddingToMklDnnPadding(padding_)):
-            convolution_forward::desc(prop_kind::forward,
-                convolution_direct, src.GetOpMemDesc(),
-                filter.GetOpMemDesc(), bias.GetOpMemDesc(),
-                output.GetOpMemDesc(), strides,
-                padding_l, padding_r,
-                TFPaddingToMklDnnPadding(padding_));
-
-        auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
-                                                                  cpu_engine);
-        AllocateOutputTensor(context, conv_prim_desc,
-                             output_dims_mkl_order, tf_fmt, &output_tensor);
-        // Set data handle for output.
-        output.SetUsrMemDataHandle(output_tensor);
-
-        Tensor* filter_out_tensor = nullptr;
-        AllocateFilterOutputTensor(context, conv_prim_desc,
-                                   TFShapeToMklDnnDims(filter_tf_shape),
-                                   &filter_out_tensor);
-
-        PrepareAndExecuteNet(conv_prim_desc, &src, &filter, &bias, &output,
-                             filter_out_tensor);
+        memory::dims bias_dims = {};
+        conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_dims);
+        ConvFwdDimensions convFwdDims(src_dims, filter_dims, bias_dims,
+            dst_dims_mkl_order, strides, dilations, padding_left, padding_right);
+        conv2d_fwd = Conv2DFwdFactory<T>::Get(convFwdDims);
       } else {
-        // Create convolution primitive without Bias.
-        // Use MKLDNN dilated convolution in case of dilated rate (>0).
-        auto conv_desc = (dilations[kDilationH] > 0 ||
-                          dilations[kDilationW] > 0) ?
-            convolution_forward::desc(prop_kind::forward,
-                convolution_direct, src.GetOpMemDesc(),
-                filter.GetOpMemDesc(), output.GetOpMemDesc(),
-                strides, dilations, padding_l, padding_r,
-                TFPaddingToMklDnnPadding(padding_)):
-            convolution_forward::desc(prop_kind::forward,
-                convolution_direct, src.GetOpMemDesc(),
-                filter.GetOpMemDesc(), output.GetOpMemDesc(),
-                strides, padding_l, padding_r,
-                TFPaddingToMklDnnPadding(padding_));
-
-        auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
-                                                                  cpu_engine);
-        AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order,
-                             tf_fmt, &output_tensor);
-        // Set data handle for output.
-        output.SetUsrMemDataHandle(output_tensor);
-
-        Tensor* filter_out_tensor = nullptr;
-        AllocateFilterOutputTensor(context, conv_prim_desc,
-                                   TFShapeToMklDnnDims(filter_tf_shape),
-                                   &filter_out_tensor);
-        PrepareAndExecuteNet(conv_prim_desc, &src, &filter,
-                             nullptr, &output, filter_out_tensor);
+        ConvFwdDimensions convFwdDims(src_dims, filter_dims, NONE_DIMS,
+            dst_dims_mkl_order, strides, dilations, padding_left, padding_right);
+        conv2d_fwd = Conv2DFwdFactory<T>::Get(convFwdDims);
       }
-    } catch (mkldnn::error& e) {
+
+      // allocate output tensors output_tensor and filter_out_tensor
+      std::shared_ptr<mkldnn::convolution_forward::primitive_desc>
+          conv_fwd_pd = conv2d_fwd->fwd_pd_;
+      AllocateOutputTensor(context, *conv_fwd_pd,
+          dst_dims_mkl_order, tf_fmt, &dst_tensor);
+      Tensor* filter_out_tensor = nullptr;
+      AllocateFilterOutputTensor(context, *conv_fwd_pd,
+          TFShapeToMklDnnDims(filter_tf_shape),
+          &filter_out_tensor);
+
+      T* dst_data = static_cast<T*>(dst_tensor->flat<T>().data());
+
+      // check whether src/filter need reorder
+      std::vector<primitive> net;
+      if (src_md.data.format != conv2d_fwd->src_fmt_)
+        src.CheckReorderToOpMem(
+            conv_fwd_pd.get()->src_primitive_desc(), &net);
+
+      if (filter_md.data.format != conv2d_fwd->filter_fmt_)
+        filter.CheckReorderToOpMem(
+            conv_fwd_pd.get()->weights_primitive_desc(),
+            filter.GetTensorBuffer(filter_out_tensor), &net);
+      stream(stream::kind::eager).submit(net).wait();
+
+      T* src_data = static_cast<T*>(
+          src.GetOpMem().get_data_handle());
+      T* filter_data = static_cast<T*>(
+          filter.GetOpMem().get_data_handle());
+
+      // execute convolution
+      if (biasEnabled) {
+        const Tensor& bias_tensor = MklGetInput(context, kInputIndex_Bias);
+        T* bias_data = static_cast<T*>(const_cast<T*>(
+            bias_tensor.flat<T>().data()));
+
+        conv2d_fwd->Execute(src_data, filter_data, bias_data, dst_data);
+      } else {
+        conv2d_fwd->Execute(src_data, filter_data, dst_data);
+      }
+    } catch (mkldnn::error &e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                         ", message: " + std::string(e.message) + ", in file " +
-                         std::string(__FILE__) + ":" + std::to_string(__LINE__);
-      OP_REQUIRES_OK(
-          context,
-          errors::Aborted("Operation received an exception:", error_msg));
+          ", message: " + std::string(e.message) +
+          ", in file " + std::string(__FILE__) + ":" +
+          std::to_string(__LINE__);
+      OP_REQUIRES_OK(context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }

@@ -706,6 +897,7 @@ class MklConv2DOp : public OpKernel {
   const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2;
   const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1;
   const int kDilationH = 0, kDilationW = 1;
+  engine cpu_engine = engine(engine::cpu, 0);

   // Allocate output tensor.
   void AllocateOutputTensor(
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index bc6d2d77a4d..50a8e305749 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -19,6 +19,8 @@ limitations under the License.
 #include <string>
 #include <vector>
+#include <memory>
+#include <unordered_map>

 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
@@ -1759,7 +1761,90 @@ class MklDnnData {
   }
 };

-#endif  // INTEL_MKL_ML
+/// Base class for operations with reuse of DNN primitives
+///
+class DnnOp {
+ public:
+  virtual ~DnnOp() {}
+
+  // Dummy data. Its size, hard-coded as 256 here, does
+  // not matter since MKL should never operate on this buffer.
+  unsigned char DummyData[256];
+};
+
+const mkldnn::memory::dims NONE_DIMS = {};
+// This constant is used to declare dummy buffer (size), for MKL primitives
+template <typename T>
+class DnnOpFactory {
+ public:
+  DnnOpFactory() {}
+  ~DnnOpFactory() {}
+
+  DnnOp* GetOp(const std::string& key) {
+    auto stream_iter = DnnOpFactory<T>::GetHashMap().find(key);
+    if (stream_iter == DnnOpFactory<T>::GetHashMap().end()) {
+      return nullptr;
+    } else {
+      return stream_iter->second;
+    }
+  }
+
+  void SetOp(const std::string& key, DnnOp* op) {
+    auto stream_iter = DnnOpFactory<T>::GetHashMap().find(key);
+
+    CHECK(stream_iter == DnnOpFactory<T>::GetHashMap().end());
+
+    DnnOpFactory<T>::GetHashMap()[key] = op;
+  }
+
+ private:
+  static inline std::unordered_map<std::string, DnnOp*>& GetHashMap() {
+    static thread_local std::unordered_map<std::string, DnnOp*> map_;
+    return map_;
+  }
+};
+
+// utility class for creating keys of MKL primitive pool.
+class FactoryKeyCreator {
+ public:
+  FactoryKeyCreator() {
+    key_.reserve(kMaxKeyLength);
+  }
+
+  ~FactoryKeyCreator() {}
+
+  void AddAsKey(const string &str) {
+    auto buffer = reinterpret_cast<const char *>(str.c_str());
+    Append(buffer, str.length());
+  }
+
+  void AddAsKey(const mkldnn::memory::dims &dims) {
+    for (unsigned int i = 0; i < dims.size(); i++) {
+      AddAsKey(dims[i]);
+    }
+  }
+
+  template <typename T>
+  void AddAsKey(const T data) {
+    auto buffer = reinterpret_cast<const char *>(&data);
+    Append(buffer, sizeof(T));
+  }
+
+  std::string GetKey() {
+    return key_;
+  }
+
+ private:
+  string key_;
+  const char delimiter = 'x';
+  const int kMaxKeyLength = 256;
+  void Append(const char* data, int len) {
+    key_.append(data, len);
+    key_.append(1, delimiter);
+  }
+};
+
+#endif  // INTEL_MKL_DNN

 }  // namespace tensorflow

 #endif  // INTEL_MKL

From 99167d3a6393ac47c2e01b6f620a03adeb9ac3e4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Fri, 20 Apr 2018 13:48:37 -0700
Subject: [PATCH 0541/1734] Merged commit includes the following changes:
 193717076 by yifeif:

     Automated g4 rollback of changelist 193713153.

 --
 193716750 by fchollet:

     Refactor `tf.keras.layers.Embedding` layer to use `embedding_lookup`
     instead of `gather`. This makes the layer TPU-compatible.

 --
 193716664 by A. Unique TensorFlower:

     Go: Update generated wrapper functions for TensorFlow ops.

 --
 193713153 by power:

     Experimental Keras TPU compatibility layer.

 --

PiperOrigin-RevId: 193717076
---
 tensorflow/go/op/wrappers.go                   | 32 +++++++++++++++++--
 tensorflow/python/keras/BUILD                  |  1 +
 .../keras/_impl/keras/layers/embeddings.py     |  4 +--
 .../_impl/keras/layers/embeddings_test.py      | 13 ++++++++
 4 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 3b3dff0573a..ec7d9dcc4f1 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -5917,6 +5917,17 @@ func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }

+// Conv3DBackpropFilterAttr is an optional argument to Conv3DBackpropFilter.
+type Conv3DBackpropFilterAttr func(optionalAttr)
+
+// Conv3DBackpropFilterDilations sets the optional dilations attribute to value.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
 // Computes the gradients of 3-D convolution with respect to the filter.
 //
 // DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
@@ -5930,11 +5941,14 @@ func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
 // strides: 1-D tensor of length 5. The stride of the sliding window for each
// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
 // padding: The type of padding algorithm to use.
-func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
+func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
 		Type: "Conv3DBackpropFilter",
 		Input: []tf.Input{
@@ -12306,6 +12320,17 @@ func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, pa
 	return op.Output(0)
 }

+// Conv3DBackpropInputAttr is an optional argument to Conv3DBackpropInput.
+type Conv3DBackpropInputAttr func(optionalAttr)
+
+// Conv3DBackpropInputDilations sets the optional dilations attribute to value.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
 // Computes the gradients of 3-D convolution with respect to the input.
 //
 // DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
@@ -12319,11 +12344,14 @@ func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, pa
 // strides: 1-D tensor of length 5. The stride of the sliding window for each
 // dimension of `input`. Must have `strides[0] = strides[4] = 1`.
 // padding: The type of padding algorithm to use.
-func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
+func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
 		Type: "Conv3DBackpropInput",
 		Input: []tf.Input{
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 70040b7e740..1c58553156e 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -208,6 +208,7 @@ py_library(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:distribute",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:embedding_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:logging_ops",
         "//tensorflow/python:math_ops",
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings.py b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
index 591bab7cd86..07b8726b859 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
@@ -24,7 +24,7 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
-from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export

@@ -155,7 +155,7 @@ class Embedding(Layer):
   def call(self, inputs):
     if K.dtype(inputs) != 'int32':
       inputs = math_ops.cast(inputs, 'int32')
-    out = array_ops.gather(self.embeddings, inputs)
+    out = embedding_ops.embedding_lookup(self.embeddings, inputs)
     return out

   def get_config(self):
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py b/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
index 9f6793eac85..6ebf5dc94ad 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+import numpy as np
+
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
@@ -65,6 +67,17 @@ class EmbeddingTest(test.TestCase):
         input_dtype='int32',
         expected_output_dtype='float32')

+  def test_embedding_correctness(self):
+    with self.test_session():
+      layer = keras.layers.Embedding(output_dim=2, input_dim=2)
+      layer.build((None, 2))
+      matrix = np.array([[1, 1], [2, 2]])
+      layer.set_weights([matrix])
+
+      inputs = keras.backend.constant([[0, 1, 0]], dtype='int32')
+      outputs = keras.backend.eval(layer(inputs))
+      self.assertAllClose(outputs, [[[1, 1], [2, 2], [1, 1]]])
+

 if __name__ == '__main__':
   test.main()

From 5a4356be6822dfe0b0f973852b9b65d69e4c169c Mon Sep 17 00:00:00 2001
From: Brian Patton
Date: Fri, 20 Apr 2018 13:54:00 -0700
Subject: [PATCH 0542/1734] Fix for: Suggest braces around initialization of
 subobject.

PiperOrigin-RevId: 193717872
---
 tensorflow/python/lib/core/bfloat16.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/lib/core/bfloat16.cc b/tensorflow/python/lib/core/bfloat16.cc
index 7f07deebef3..77fa2c1f66d 100644
--- a/tensorflow/python/lib/core/bfloat16.cc
+++ b/tensorflow/python/lib/core/bfloat16.cc
@@ -616,8 +616,8 @@ bool Initialize() {
   };

   // Comparisons
-  const std::array<int, 3> compare_types = {npy_bfloat16_, npy_bfloat16_,
-                                            NPY_BOOL};
+  const std::array<int, 3> compare_types = {
+      {npy_bfloat16_, npy_bfloat16_, NPY_BOOL}};

   if (!register_ufunc("equal", CompareUFunc<Bfloat16EqFunctor>,
                       compare_types)) {

From 1cd64d57143814fc0652c09165735be62d96124f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Fri, 20 Apr 2018 13:56:55 -0700
Subject: [PATCH 0543/1734] Track dependencies between outside_compilation
 clusters so that control edges can be correctly added to sequence compiled
 computations.
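The heart of the change is ordering outside_compilation clusters by their host-side dependencies. A small Python sketch of the idea (the C++ `TopologicalClusterSort` below uses an explicit work stack; this recursive version is an editor's illustration only):

```python
# `ancestors` maps a cluster to the clusters that must execute before it.
def topological_cluster_sort(clusters, ancestors):
    sorted_clusters, visited = [], set()

    def visit(cluster):
        if cluster in visited:
            return
        visited.add(cluster)
        for ancestor in ancestors.get(cluster, ()):
            visit(ancestor)
        sorted_clusters.append(cluster)  # emitted after all of its ancestors

    for cluster in clusters:
        visit(cluster)
    return sorted_clusters

# Example: cluster D consumes a value produced by cluster C.
assert topological_cluster_sort(['C', 'D'], {'D': {'C'}}) == ['C', 'D']
```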
#include #include "tensorflow/compiler/jit/graph_to_functiondef.h" +#include "tensorflow/compiler/jit/graphcycles/graphcycles.h" #include "tensorflow/compiler/jit/legacy_flags/encapsulate_subgraphs_pass_flags.h" #include "tensorflow/compiler/jit/mark_for_compilation_pass.h" #include "tensorflow/compiler/jit/shape_inference_helpers.h" @@ -160,6 +161,11 @@ class Encapsulator { std::move(outside_compilation_attribute)), graph_in_(graph_in) {} + // Find dependencies between subgraphs and outside_compilation clusters that + // only manifest via edges between outside_compilation clusters in the outer + // (non-compiled) graph. + Status FindClusterDependencies(); + // Find subgraphs marked with 'group_attribute', and build a new // subgraph, one for each value of 'group_attribute'. Status SplitIntoSubgraphs(); @@ -230,6 +236,19 @@ class Encapsulator { // the shapes of any ancestor RAH outputs. If it can be determined that the // shape of the SFH inputs will not be inferrable even once the shapes of the // RAH outputs are known, an error is returned by the rewriter. + // + // Once edges between compiled and outside_compilation clusters have been + // replaced by send/recv ops, some dependencies may no longer be apparent. + // A clustering pass finds all the dependencies between HC nodes that are only + // present as a result of edges between nodes in outside_compilaton clusters. + // Suppose there is a path from outside_compilation cluster C in subgraph S + // to outside_compilation cluster D in subgraph T. If S != T then a control + // edge is added from the call node for S to the call node for T, which + // ensures that C will execute before D because S executes before T. If S==T + // then a control dependency is added between the HC nodes for C and D in S, + // and the HC node for C is added to an 'ancestors' attr in the HC node for D + // so that during compilation of the HC node for D, an XLA control dependency + // can be added to ensure C's SendToHost executes before D's RecvFromHost. class Subgraph { public: // Creates a graph to build the subgraph in, if it doesn't already exist, @@ -324,6 +343,18 @@ class Encapsulator { void RecordOutsideCompilationOutputOrControl( const string& outside_compilation_id, const Edge* edge); + // Records the fact that there is a path from a node in outside_compilation + // cluster ancestor to node in cluster successor that does not go through + // the subgraph. + void RecordOutsideCompilationDependency(const string& successor, + const string& ancestor); + + // Returns the mapping from outside_compilation cluster C to the set of + // outside_compilation clusters that have a path to C entirely outside + // compiled subgraphs. + const std::unordered_map> + OutsideCompilationAncestorMap() const; + // Adds the HostCompute nodes for each outside_compilation subgraph. Status AddHostComputes( const string& subgraph_name, @@ -406,6 +437,13 @@ class Encapsulator { Status AddHostComputeKeyPlaceholder(OutsideCompilationSubgraph* oc_subgraph, Graph* graph_out); + // Get the set of outside_compilation clusters and the dependency edges + // between them. + void GetActiveClusterDependencyGraph( + std::unordered_set* clusters, + std::unordered_set* has_successor, + std::unordered_map>* ancestors_map); + // Builds a _RecvAtHost node producing all the inputs of an // outside_compilation subgraph and stores it in oc_subgraph.recv_at_host. 
Status AddRecvAtHostNode(const string& group_attribute, @@ -468,6 +506,14 @@ class Encapsulator { // The outside_compilation clusters in this subgraph. std::unordered_map outside_compilation_subgraphs_; + // For each outside_compilation cluster C, the outside_compilation clusters + // that have a path to C outside the compiled graph. + std::unordered_map> + outside_compilation_ancestors_; + // For each outside_compilation cluster C, the outside_compilation clusters + // that have a path from C outside the compiled graph. + std::unordered_map> + outside_compilation_successors_; // NoOp node in the output graph that is sequenced after the call node and // used to prevent host-side outside_compilation sends and recvs from being @@ -556,6 +602,10 @@ class Encapsulator { std::unordered_set, NodeSlot::PairHasher>* edges_added); + // Adds control dependencies between subgraph call nodes that have + // dependencies via outside_compilation edges. + Status AddCallNodeDependencies(Graph* graph_out); + // Adds all edges to the output graph. Status AddEdgesToOutputGraph( const std::unordered_map& node_images, @@ -620,10 +670,65 @@ class Encapsulator { const Graph* graph_in_; std::unordered_map subgraphs_; + // For each subgraph S the subgraphs S' such that there is a path in some + // outside_compilation cluster C in S to some outside_compilation cluster C' + // in S', that goes only through the uncompiled graph. + std::unordered_map> subgraph_ancestors_; TF_DISALLOW_COPY_AND_ASSIGN(Encapsulator); }; +namespace { + +// Return in 'sorted' a topological sort of clusters according to the +// dependencies encoded in ancestors. clusters is the list of all clusters +// including clusters that are not present in the ancestors map. has_successors +// is the set of clusters that are ancestors of some other cluster. +void TopologicalClusterSort( + const std::unordered_set& clusters, + const std::unordered_set& has_successors, + const std::unordered_map>& ancestors, + std::vector* sorted) { + // The nodes are placed in 'sorted' in topological order. + sorted->clear(); + // We don't use the standard DFS because we are not operating on Node* + // objects. + struct Work { + string cluster; + bool leave; + }; + std::set visited; + std::vector stack; + // Seed the processing list with clusters that have no successors. 
+ for (const auto& cluster : clusters) { + if (has_successors.find(cluster) == has_successors.end()) { + stack.push_back({cluster, false}); + } + } + while (!stack.empty()) { + const Work item = stack.back(); + stack.pop_back(); + if (item.leave) { + sorted->push_back(item.cluster); + continue; + } + + if (visited.find(item.cluster) != visited.end()) continue; + visited.insert(item.cluster); + + stack.push_back({item.cluster, true}); + const auto& iter = ancestors.find(item.cluster); + if (iter != ancestors.end()) { + for (const auto& ancestor : iter->second) { + stack.push_back({ancestor, false}); + } + } + } + CHECK(sorted->size() == clusters.size()); +} + +} // namespace + Node* Encapsulator::Subgraph::GetCallNodeForInputs() const { return call_node_inputs_; } @@ -786,12 +891,71 @@ void Encapsulator::Subgraph::RecordOutsideCompilationOutputOrControl( } } +void Encapsulator::Subgraph::RecordOutsideCompilationDependency( + const string& successor, const string& ancestor) { + outside_compilation_ancestors_[successor].insert(ancestor); + outside_compilation_successors_[ancestor].insert(successor); +} + +const std::unordered_map> +Encapsulator::Subgraph::OutsideCompilationAncestorMap() const { + return outside_compilation_ancestors_; +} + +void Encapsulator::Subgraph::GetActiveClusterDependencyGraph( + std::unordered_set* clusters, + std::unordered_set* has_successor, + std::unordered_map>* ancestors_map) { + // During initial clustering the ancestor and successor datastructures may + // have been built including oc_cluster names that never turned into subgraphs + // because they had no edges into or out of the compiled cluster. Remove them + // before proceeding to simplify the logic. Get the set of clusters that was + // actually added, then remove references to the others. + for (const auto& oc_subgraph : outside_compilation_subgraphs_) { + clusters->insert(oc_subgraph.first); + } + for (const auto& cluster : outside_compilation_successors_) { + if (clusters->find(cluster.first) != clusters->end()) { + for (const auto& successor : cluster.second) { + if (clusters->find(successor) != clusters->end()) { + has_successor->insert(cluster.first); + break; + } + } + } + } + for (const auto& cluster : outside_compilation_ancestors_) { + if (clusters->find(cluster.first) != clusters->end()) { + std::unordered_set& ancestors = (*ancestors_map)[cluster.first]; + for (const auto& ancestor : cluster.second) { + if (clusters->find(ancestor) != clusters->end()) { + ancestors.insert(ancestor); + } + } + } + } +} + Status Encapsulator::Subgraph::AddHostComputes( const string& subgraph_name, const std::unordered_map& node_images) { - for (auto& oc_subgraph_iter : outside_compilation_subgraphs_) { - const string& oc_subgraph_name = oc_subgraph_iter.first; - OutsideCompilationSubgraph& oc_subgraph = oc_subgraph_iter.second; + // Get the set of outside_compilation clusters and the dependency edges + // between them. + std::unordered_set clusters; + std::unordered_set has_successor; + std::unordered_map> ancestors_map; + GetActiveClusterDependencyGraph(&clusters, &has_successor, &ancestors_map); + // Topologically sort the outside_compilation clusters according to their + // dependency relation. 
+ std::vector sorted_clusters; + TopologicalClusterSort(clusters, has_successor, ancestors_map, + &sorted_clusters); + + // The host compute nodes added for each outside_compilation_cluster; + std::unordered_map host_compute_node; + for (const string& oc_subgraph_name : sorted_clusters) { + OutsideCompilationSubgraph& oc_subgraph = + outside_compilation_subgraphs_[oc_subgraph_name]; if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty() || !oc_subgraph.outputs_by_src.empty() || !oc_subgraph.control_outputs.empty()) { @@ -811,13 +975,22 @@ Status Encapsulator::Subgraph::AddHostComputes( inputs[input_index].Reset(src_image->name(), src_slot, dtype); input_dtypes[input_index] = dtype; } - for (const auto& output : oc_subgraph.outputs_by_src) { DataType dtype = output.first.dtype; int output_index = output.second; output_dtypes[output_index] = dtype; } + std::vector host_compute_ancestors; + const auto iter = ancestors_map.find(oc_subgraph_name); + if (iter != ancestors_map.end()) { + for (const string& ancestor_cluster : iter->second) { + host_compute_ancestors.push_back( + outside_compilation_subgraphs_[ancestor_cluster] + .host_compute_name); + } + } + NodeDef host_compute_def; NodeDefBuilder builder(strings::StrCat("outside_compilation_", oc_subgraph_name, "_host_compute"), @@ -825,6 +998,7 @@ Status Encapsulator::Subgraph::AddHostComputes( builder.Input(inputs); builder.Attr("Tinputs", input_dtypes); builder.Attr("Toutputs", output_dtypes); + builder.Attr("ancestors", host_compute_ancestors); builder.Attr("key", strings::StrCat("host_compute_channel_", subgraph_name, "_", oc_subgraph_name)); @@ -834,6 +1008,7 @@ Status Encapsulator::Subgraph::AddHostComputes( Node* host_compute = graph_->AddNode(host_compute_def, &s); if (!s.ok()) return s; + host_compute_node[host_compute->name()] = host_compute; oc_subgraph.host_compute_name = host_compute->name(); // Connect the _HostCompute node to its producers in the subgraph. @@ -852,6 +1027,12 @@ Status Encapsulator::Subgraph::AddHostComputes( graph_->AddControlEdge(src_image, host_compute); } + // Connect the _HostCompute node to its ancestor host compute nodes. + for (const auto& ancestor_name : host_compute_ancestors) { + Node* ancestor = host_compute_node[ancestor_name]; + graph_->AddControlEdge(ancestor, host_compute); + } + // Connect the consumers in the subgraph to the _HostCompute node. 
    for (const auto& output : oc_subgraph.outputs_by_dst) {
      const Node* dst_node = output.first.node;
@@ -1654,6 +1835,17 @@ Status Encapsulator::CopyEdgeToOutputGraph(
   return Status::OK();
 }
 
+Status Encapsulator::AddCallNodeDependencies(Graph* graph_out) {
+  for (const auto& ancestors : subgraph_ancestors_) {
+    const string& subgraph = ancestors.first;
+    for (const string& ancestor : ancestors.second) {
+      graph_out->AddControlEdge(subgraphs_[ancestor].GetCallNodeForOutputs(),
+                                subgraphs_[subgraph].GetCallNodeForInputs());
+    }
+  }
+  return Status::OK();
+}
+
 Status Encapsulator::AddEdgesToOutputGraph(
     const std::unordered_map<const Node*, Node*>& node_images,
     bool parallel_checking, Graph* graph_out) {
@@ -1703,6 +1895,7 @@ Status Encapsulator::AddEdgesToOutputGraph(
     Subgraph& subgraph = subgraph_entry.second;
     subgraph.ConnectSequencerToCallNode(graph_out);
   }
+  TF_RETURN_IF_ERROR(AddCallNodeDependencies(graph_out));
 
   return Status::OK();
 }
@@ -1960,6 +2153,182 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
   return Status::OK();
 }
 
+namespace {
+
+// Helper struct for building cluster dependencies and also debugging cycles in
+// the dependencies. While computing dependencies we construct a mapping from
+// Node* to PathDetails.
+struct PathDetails {
+  struct SubgraphAndCluster {
+    string subgraph;
+    string outside_compilation_cluster;
+    bool operator==(const SubgraphAndCluster& other) const {
+      return subgraph == other.subgraph &&
+             outside_compilation_cluster == other.outside_compilation_cluster;
+    }
+  };
+
+  struct SubgraphAndClusterHash {
+    inline std::size_t operator()(const SubgraphAndCluster& v) const {
+      return hash<string>()(
+          strings::StrCat(v.subgraph, v.outside_compilation_cluster));
+    }
+  };
+
+  typedef std::unordered_set<SubgraphAndCluster, SubgraphAndClusterHash>
+      SubgraphAndClusterSet;
+
+  // Returns the set of (subgraph, oc_cluster) pairs that should be recorded as
+  // ancestors for any successor of this node. If the node is in the outer
+  // graph, it returns the transitive union of the ancestors of the node's
+  // inputs. If the node is in an outside_compilation cluster, it returns just
+  // that cluster. If the node is compiled, it returns the empty set.
+  SubgraphAndClusterSet AncestorsForSuccessor() {
+    if (subgraph.empty()) {
+      return ancestor_clusters;
+    } else if (outside_compilation_cluster.empty()) {
+      return SubgraphAndClusterSet();
+    } else {
+      SubgraphAndCluster entry;
+      entry.subgraph = subgraph;
+      entry.outside_compilation_cluster = outside_compilation_cluster;
+      return SubgraphAndClusterSet({entry});
+    }
+  }
+
+  // The transitive union of the ancestors of this node's inputs. This is only
+  // saved for debugging in order to print out enough information to debug a
+  // discovered cycle.
+  SubgraphAndClusterSet ancestor_clusters;
+  // The subgraph attr on this node.
+  string subgraph;
+  // The outside_compilation attr on this node.
+  string outside_compilation_cluster;
+};
+
+// Adds an edge from ancestor to successor to the cycle detector, and returns
+// an error if that edge causes the formation of a cycle. In the error case,
+// logs the contents of the node_ancestors_map to facilitate debugging.
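+// A minimal usage sketch of the cycle-detector calls below (assumed to match
+// the GraphCycles API; the node names a and b are invented):
+//
+//   GraphCycles gc;
+//   int32 a = gc.NewNode();
+//   int32 b = gc.NewNode();
+//   gc.InsertEdge(a, b);  // returns true: no cycle yet.
+//   gc.InsertEdge(b, a);  // returns false: this edge would close a cycle,
+//                         // which is the case reported as an error below.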
+Status CheckClusterDependencyForCycles(
+    const string& ancestor, const string& successor,
+    const std::unordered_map<string, std::unordered_set<string>>& ancestors,
+    const std::unordered_map<Node*, PathDetails>& node_ancestors_map,
+    GraphCycles* cycle_detector, std::map<string, int32>* cycle_detector_map) {
+  if (cycle_detector_map->find(ancestor) == cycle_detector_map->end()) {
+    (*cycle_detector_map)[ancestor] = cycle_detector->NewNode();
+  }
+  if (cycle_detector_map->find(successor) == cycle_detector_map->end()) {
+    (*cycle_detector_map)[successor] = cycle_detector->NewNode();
+  }
+
+  if (!cycle_detector->InsertEdge((*cycle_detector_map)[ancestor],
+                                  (*cycle_detector_map)[successor])) {
+    LOG(ERROR) << "Cycle in outside_compilation clusters";
+    for (const auto& cluster : ancestors) {
+      LOG(ERROR) << "Cluster " << cluster.first << " depends on:";
+      for (const auto& ancestor : cluster.second) {
+        LOG(ERROR) << "  " << ancestor;
+      }
+    }
+    for (const auto& node_ancestors : node_ancestors_map) {
+      LOG(ERROR) << "Node " << node_ancestors.first->name() << " ("
+                 << node_ancestors.second.subgraph << ";"
+                 << node_ancestors.second.outside_compilation_cluster
+                 << ") has ancestor clusters:";
+      for (const auto& ancestor : node_ancestors.second.ancestor_clusters) {
+        LOG(ERROR) << "  " << ancestor.subgraph << ";"
+                   << ancestor.outside_compilation_cluster;
+      }
+    }
+    return errors::InvalidArgument(
+        "Can't compile outside_compilation clusters because there is a "
+        "dependency cycle: see error log for details.");
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+Status Encapsulator::FindClusterDependencies() {
+  // Map from nodes to ancestor details. A node is entered into the map if it
+  // is in a compilation subgraph, an outside_compilation cluster, or appears
+  // on a path in the outer graph leading from an outside_compilation subgraph.
+  std::unordered_map<Node*, PathDetails> node_ancestors_map;
+  // We check that clusters are acyclic using this cycle detector.
+  GraphCycles cycle_detector;
+  // Map from cluster name to cycle detector node id.
+  std::map<string, int32> cycle_detector_map;
+  // Process the nodes in topologically-sorted order.
+  std::vector<Node*> nodes;
+  GetReversePostOrder(*graph_in_, &nodes);
+  for (Node* node : nodes) {
+    string subgraph_name;
+    string oc_cluster;
+    TF_RETURN_IF_ERROR(GetFunctionNameAttr(node, &subgraph_name, &oc_cluster));
+    // First create an entry in the ancestors map if the node is in a compiled
+    // subgraph or outside_compilation cluster, or if any incoming edge is from
+    // a node with an ancestor map entry; and find the union of all the
+    // ancestors.
+    if (!subgraph_name.empty()) {
+      node_ancestors_map[node].subgraph = subgraph_name;
+      node_ancestors_map[node].outside_compilation_cluster = oc_cluster;
+    }
+    for (Node* src : node->in_nodes()) {
+      const auto iter = node_ancestors_map.find(src);
+      if (iter != node_ancestors_map.end()) {
+        const auto& ancestors_to_follow = iter->second.AncestorsForSuccessor();
+        for (const auto& ancestor : ancestors_to_follow) {
+          if (ancestor.subgraph != subgraph_name ||
+              ancestor.outside_compilation_cluster != oc_cluster) {
+            node_ancestors_map[node].ancestor_clusters.insert(ancestor);
+          }
+        }
+      }
+    }
+    if (!subgraph_name.empty()) {
+      // The node is in a compiled subgraph or an outside_compilation cluster.
+      if (oc_cluster.empty()) {
+        // The node is not in an outside_compilation cluster. Record the
+        // subgraph's ancestor dependencies.
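+        // Worked example (subgraph and cluster names invented): if this node
+        // is in subgraph "F2" and one of its inputs was produced, via a path
+        // through the outer graph, by a node in cluster ("F1", "O1"), then
+        // AncestorsForSuccessor() propagated ("F1", "O1") into this node's
+        // ancestor_clusters, and the loop below records
+        // subgraph_ancestors_["F2"] -> {"F1"}.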
+ for (const auto& cluster : node_ancestors_map[node].ancestor_clusters) { + if (cluster.subgraph != subgraph_name) { + subgraph_ancestors_[subgraph_name].insert(cluster.subgraph); + TF_RETURN_IF_ERROR(CheckClusterDependencyForCycles( + cluster.subgraph, subgraph_name, subgraph_ancestors_, + node_ancestors_map, &cycle_detector, &cycle_detector_map)); + } + } + } else { + Subgraph& subgraph = subgraphs_[subgraph_name]; + // The node is in an outside_compilation cluster. Record the cluster + // and/or subgraph ancestor dependencies. + for (const auto& cluster : node_ancestors_map[node].ancestor_clusters) { + if (cluster.subgraph == subgraph_name) { + // The ancestor is in the same subgraph. + if (cluster.outside_compilation_cluster != oc_cluster) { + // But not in the same oc_cluster, so record the dependency. + subgraph.RecordOutsideCompilationDependency( + oc_cluster, cluster.outside_compilation_cluster); + TF_RETURN_IF_ERROR(CheckClusterDependencyForCycles( + cluster.outside_compilation_cluster, oc_cluster, + subgraph.OutsideCompilationAncestorMap(), node_ancestors_map, + &cycle_detector, &cycle_detector_map)); + } + } else { + // The ancestor is in a different subgraph, so record the + // dependency. + subgraph_ancestors_[subgraph_name].insert(cluster.subgraph); + TF_RETURN_IF_ERROR(CheckClusterDependencyForCycles( + cluster.subgraph, subgraph_name, subgraph_ancestors_, + node_ancestors_map, &cycle_detector, &cycle_detector_map)); + } + } + } + } + } + return Status::OK(); +} + Status Encapsulator::MakePrunedGraphCopyAndInline( const Graph& graph, const std::vector& sink_nodes, std::unique_ptr* pruned_graph, @@ -2166,6 +2535,7 @@ Status EncapsulateSubgraphsInFunctions( Encapsulator encapsulator(std::move(group_attribute), std::move(outside_compilation_attribute), &graph_in); + TF_RETURN_IF_ERROR(encapsulator.FindClusterDependencies()); TF_RETURN_IF_ERROR(encapsulator.SplitIntoSubgraphs()); TF_RETURN_IF_ERROR(encapsulator.BuildFunctionDefs( diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc index 8599a7038af..3502d1bb459 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc @@ -74,7 +74,7 @@ bool EqualProtoMap(const ::tensorflow::protobuf::Map& a, if (!compare(elt_a.first, elt_a.second, iter->second)) { if (diff) { *diff = strings::StrCat(map_name, " expected: element with key '", - key_to_string(elt_a.first), " has value '", + key_to_string(elt_a.first), "' has value '", value_to_string(elt_a.second), "' got: '", value_to_string(iter->second), "'"); } @@ -121,8 +121,22 @@ bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b, } return false; } + std::unordered_set control_input_a; + std::unordered_set control_input_b; for (int i = 0; i < a.input_size(); ++i) { - if (a.input(i) != b.input(i)) { + if (str_util::StartsWith(a.input(i), "^")) { + if (!str_util::StartsWith(b.input(i), "^")) { + if (diff) { + *diff = strings::StrCat( + diff_preamble, " mismatch for node ", a.name(), " input ", i, + ", expected control input ", a.input(i), " got ", b.input(i), + " expected:\n", a.DebugString(), "\ngot:\n", b.DebugString()); + } + return false; + } + control_input_a.insert(a.input(i)); + control_input_b.insert(b.input(i)); + } else if (a.input(i) != b.input(i)) { if (diff) { *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(), " input ", i, ", expected ", a.input(i), @@ -132,11 +146,29 @@ bool 
EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b, return false; } } + if (control_input_a != control_input_b) { + if (diff) { + *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(), + " control inputs differ expected:\n", + a.DebugString(), "\ngot:\n", b.DebugString()); + } + return false; + } return EqualProtoMap( a.attr(), b.attr(), [](const string& s) { return s; }, [](const AttrValue& v) { return v.DebugString(); }, [](const string& key, const AttrValue& av, const AttrValue& bv) { - return av.DebugString() == bv.DebugString(); + if (key == "ancestors") { + // The ancestors are added from a set so the order is unpredictable; + // just compare set equality not list equality. + std::unordered_set a_set(av.list().s().begin(), + av.list().s().end()); + std::unordered_set b_set(bv.list().s().begin(), + bv.list().s().end()); + return a_set == b_set; + } else { + return av.DebugString() == bv.DebugString(); + } }, strings::StrCat(diff_preamble, " attr mismatch for node ", a.name()), diff); @@ -261,6 +293,7 @@ REGISTER_OP("XlaHostCompute") .Output("outputs: Toutputs") .Attr("Tinputs: list(type) >= 0") .Attr("Toutputs: list(type) >= 0") + .Attr("ancestors: list(string) >= 0") .Attr("key: string") .Attr("shape_inference_graph: string = ''") .Attr("shapes: list(shape) >= 0") @@ -899,6 +932,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) { {"C:o:0", "c:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O1"}, @@ -1044,17 +1078,20 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { {"D:o:0", "F:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", + gtl::ArraySlice({"outside_compilation_O1_host_compute"})}, {"key", "host_compute_channel_F1_O2"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O2"}, {"shapes", gtl::ArraySlice({})}, {"_outside_compilation_subgraph", "O2"}}, - {"F"}}, + {"F", "outside_compilation_O1_host_compute"}}, {{"outside_compilation_O1_host_compute"}, "XlaHostCompute", {"C:o:0", "D:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O1"}, @@ -1193,6 +1230,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { {"C:o:0", "D:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O1"}, @@ -1215,6 +1253,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { {"G:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F2_O1"}, {"shape_inference_graph", ""}, {"shapes", @@ -1279,6 +1318,179 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library); } +// Test with two functions to transform, each with one outside_compilation +// cluster, with the dependency between them purely from an outside_compilation +// edge. 
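+// Concretely, the only F1 -> F2 dependency below is the control edge from E
+// (in cluster F1/O1) to H (in cluster F2/O1); neither compiled function sees
+// that edge directly, so the pass itself must sequence the call nodes, which
+// the expected graph checks via WithControlInputs({s2, call1}) on the F2
+// call node.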
+TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) { + FunctionDefLibrary library; + GraphDef graphdef; + + { + GraphDefBuilder b1(GraphDefBuilder::kFailImmediately); + Node* a = InputShaped(b1.opts().WithName("A")); + Node* b = InputShaped(b1.opts().WithName("B")); + Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1")); + Node* d = + Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1")); + Node* e = Binary(c, d, + b1.opts() + .WithName("E") + .WithControlInputs({b, d}) + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* f = Binary(c, e, + b1.opts().WithName("F").WithControlInput(e).WithAttr( + "_encapsulate", "F1")); + Node* g = + Binary(a, b, b1.opts().WithName("G").WithAttr("_encapsulate", "F2")); + Node* h = Unary(g, b1.opts() + .WithName("H") + .WithAttr("_encapsulate", "F2") + .WithAttr("_outside", "O1") + .WithControlInput(e)); + Node* i = Unary(h, b1.opts().WithName("I").WithAttr("_encapsulate", "F2")); + Binary(f, i, b1.opts().WithName("J")); + TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); + } + + TF_EXPECT_OK(Encapsulate(&graphdef, &library)); + + FunctionDefLibrary library_expected; + GraphDef graphdef_expected; + + { + GraphDefBuilder shape(GraphDefBuilder::kFailImmediately); + Node* key_constant = + KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0")); + Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", + {DT_FLOAT, DT_FLOAT}, shape.opts()); + Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1), + shape.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape.opts()); + TF_EXPECT_OK( + AddGraphDefToFunctionLibrary(shape, "F1_O1", &library_expected)); + } + + { + GraphDefBuilder shape(GraphDefBuilder::kFailImmediately); + Node* key_constant = + KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0")); + Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F2", "O1", + {DT_FLOAT}, shape.opts()); + Node* h = Unary(recv, shape.opts() + .WithName("H") + .WithAttr("_encapsulate", "F2") + .WithAttr("_outside", "O1")); + SendFromHost(ops::NodeOut(key_constant, 0), "F2", "O1", {h}, shape.opts()); + TF_EXPECT_OK( + AddGraphDefToFunctionLibrary(shape, "F2_O1", &library_expected)); + } + + *library_expected.add_function() = FunctionDefHelper::Create( + "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {}, + { + {{"C"}, "UnaryTest", {"a_0_arg"}}, + {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}}, + {{"F"}, + "BinaryTest", + {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"}, + {}, + {"outside_compilation_O1_host_compute"}}, + {{"outside_compilation_O1_host_compute"}, + "XlaHostCompute", + {"C:o:0", "D:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", + "_outside_compilation_shape_inference_F1_O1"}, + {"shapes", gtl::ArraySlice({})}, + {"_outside_compilation_subgraph", "O1"}}, + {"D"}}, + }, + {{"f_0_retval", "F:o:0"}}); + + *library_expected.add_function() = FunctionDefHelper::Create( + "F2", {"a_0_arg:float", "b_0_arg:float"}, {"i_0_retval:float"}, {}, + { + {{"G"}, "BinaryTest", {"a_0_arg", "b_0_arg"}}, + {{"I"}, + "UnaryTest", + {"outside_compilation_O1_host_compute:outputs:0"}}, + {{"outside_compilation_O1_host_compute"}, + "XlaHostCompute", + {"G:o:0"}, + {{"Tinputs", 
gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, + {"key", "host_compute_channel_F2_O1"}, + {"shape_inference_graph", + "_outside_compilation_shape_inference_F2_O1"}, + {"shapes", gtl::ArraySlice({})}, + {"_outside_compilation_subgraph", "O1"}}}, + }, + {{"i_0_retval", "I:o:0"}}); + + { + std::unique_ptr lib_def( + new FunctionLibraryDefinition(OpRegistry::Global(), library_expected)); + GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get()); + Node* a = InputShaped(b2.opts().WithName("A")); + Node* b = InputShaped(b2.opts().WithName("B")); + + Node* key_constant1 = + KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder")); + Node* recv1 = RecvAtHost(ops::NodeOut(key_constant1, 0), "F1", "O1", + {DT_FLOAT, DT_FLOAT}, b2.opts()); + Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1), + b2.opts() + .WithName("E") + .WithControlInputs({recv1, b}) + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* send1 = SendFromHost(ops::NodeOut(key_constant1, 0), "F1", "O1", {e}, + b2.opts().WithControlInput(e)); + Node* s1 = Sequencer( + b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}), + "F1"); + + NodeBuilder node_builder1("F1", "F1", lib_def.get()); + node_builder1.Input(a).Input(b); + Node* call1 = + b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1); + + Node* key_constant2 = + KeyPlaceholder("F2", b2.opts().WithName("F2_key_placeholder")); + Node* recv2 = RecvAtHost(ops::NodeOut(key_constant2, 0), "F2", "O1", + {DT_FLOAT}, b2.opts()); + Node* h = Unary(recv2, b2.opts() + .WithName("H") + .WithAttr("_encapsulate", "F2") + .WithAttr("_outside", "O1") + .WithControlInput(e)); + Node* send2 = SendFromHost(ops::NodeOut(key_constant2, 0), "F2", "O1", {h}, + b2.opts()); + + Node* s2 = Sequencer( + b2.opts().WithName("F2_sequencer").WithControlInputs({recv2, send2}), + "F2"); + NodeBuilder node_builder2("F2", "F2", lib_def.get()); + node_builder2.Input(a).Input(b); + Node* call2 = b2.opts() + .WithControlInputs({s2, call1}) + .FinalizeBuilder(&node_builder2); + Binary(call1, call2, b2.opts().WithName("J")); + TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected)); + } + + TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef); + TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library); +} + // Test with one outside_compilation cluster that has no inputs from the // compiled subgraph. 
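// In this case the XlaHostCompute op is expected to get an empty "Tinputs"
// list, and because the cluster's output shape is statically known the
// expected attrs carry it in "shapes" and leave shape_inference_graph empty.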
TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) { @@ -1323,6 +1535,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) { {}, {{"Tinputs", gtl::ArraySlice({})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", ""}, {"shapes", @@ -1406,6 +1619,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) { {}, {{"Tinputs", gtl::ArraySlice({})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", ""}, {"shapes", @@ -1487,6 +1701,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) { {"D:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", ""}, {"shapes", gtl::ArraySlice({})}, @@ -1567,6 +1782,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) { {"D:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", ""}, {"shapes", gtl::ArraySlice({})}, @@ -1607,6 +1823,371 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) { TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library); } +// Test with two outside_compilation clusters that interact outside the compiled +// subgraph, where the ancestor has no HostCompute Op. +TEST(EncapsulateSubgraphsTest, + OutsideCompilationClusterDependencyNoSrcCluster) { + FunctionDefLibrary library; + GraphDef graphdef; + + { + GraphDefBuilder b1(GraphDefBuilder::kFailImmediately); + Node* a = Input(b1.opts().WithName("A")); + Node* b = Input(b1.opts().WithName("B")); + Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1")); + Node* d = + Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1")); + Node* e = Unary(a, b1.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* f = Unary(d, b1.opts().WithName("F").WithAttr("_encapsulate", "F1")); + Node* g = Unary(f, b1.opts() + .WithName("G") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O2") + .WithControlInput(e)); + Node* h = Unary(g, b1.opts().WithName("H").WithAttr("_encapsulate", "F1")); + Binary(e, h, b1.opts().WithName("I")); + TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); + } + + TF_EXPECT_OK(Encapsulate(&graphdef, &library)); + + FunctionDefLibrary library_expected; + GraphDef graphdef_expected; + + { + GraphDefBuilder shape2(GraphDefBuilder::kFailImmediately); + Node* key_constant = + KeyPlaceholderShape(shape2.opts().WithName("KnownShape/_0")); + Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2", + {DT_FLOAT}, shape2.opts()); + Node* g = Unary(ops::NodeOut(recv2, 0), shape2.opts() + .WithName("G") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O2")); + SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g}, shape2.opts()); + TF_EXPECT_OK( + AddGraphDefToFunctionLibrary(shape2, "F1_O2", &library_expected)); + } + + *library_expected.add_function() = FunctionDefHelper::Create( + "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval:float"}, {}, + { + {{"C"}, "UnaryTest", {"a_0_arg"}}, + {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}}, + {{"F"}, "UnaryTest", {"D:o:0"}}, + {{"H"}, + "UnaryTest", + {"outside_compilation_O2_host_compute:outputs:0"}}, + 
{{"outside_compilation_O2_host_compute"}, + "XlaHostCompute", + {"F:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, + {"key", "host_compute_channel_F1_O2"}, + {"shape_inference_graph", + "_outside_compilation_shape_inference_F1_O2"}, + {"shapes", gtl::ArraySlice({})}, + {"_outside_compilation_subgraph", "O2"}}}, + }, + {{"h_0_retval", "H:o:0"}}); + + { + std::unique_ptr lib_def( + new FunctionLibraryDefinition(OpRegistry::Global(), library_expected)); + GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get()); + Node* a = Input(b2.opts().WithName("A")); + Node* b = Input(b2.opts().WithName("B")); + + Node* e = Unary(a, b2.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* key_constant = + KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder")); + Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2", + {DT_FLOAT}, b2.opts()); + Node* g = Unary(recv, b2.opts() + .WithName("G") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O2") + .WithControlInput(e)); + Node* send = + SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g}, b2.opts()); + Node* s1 = Sequencer( + b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}), + "F1"); + NodeBuilder node_builder1("F1", "F1", lib_def.get()); + node_builder1.Input(a).Input(b).ControlInput(s1); + Node* call1 = b2.opts().FinalizeBuilder(&node_builder1); + + Binary(e, call1, b2.opts().WithName("I")); + TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected)); + } + + TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef); + TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library); +} + +// Test with two outside_compilation clusters that interact outside the compiled +// subgraph, where the successor has no HostCompute Op. 
+TEST(EncapsulateSubgraphsTest, + OutsideCompilationClusterDependencyNoDstCluster) { + FunctionDefLibrary library; + GraphDef graphdef; + + { + GraphDefBuilder b1(GraphDefBuilder::kFailImmediately); + Node* a = Input(b1.opts().WithName("A")); + Node* b = Input(b1.opts().WithName("B")); + Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1")); + Node* d = + Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1")); + Node* e = Unary(d, b1.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* f = Unary(e, b1.opts().WithName("F").WithAttr("_encapsulate", "F1")); + /*Node* g =*/Unary(a, b1.opts() + .WithName("G") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O2") + .WithControlInput(e)); + Node* h = Unary(f, b1.opts().WithName("H").WithAttr("_encapsulate", "F1")); + Binary(e, h, b1.opts().WithName("I")); + TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); + } + + TF_EXPECT_OK(Encapsulate(&graphdef, &library)); + + FunctionDefLibrary library_expected; + GraphDef graphdef_expected; + + { + GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately); + Node* key_constant = + KeyPlaceholderShape(shape1.opts().WithName("KnownShape/_0")); + Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", + {DT_FLOAT}, shape1.opts()); + Node* e = Unary(ops::NodeOut(recv2, 0), shape1.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape1.opts()); + TF_EXPECT_OK( + AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected)); + } + + *library_expected.add_function() = FunctionDefHelper::Create( + "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval:float"}, {}, + { + {{"C"}, "UnaryTest", {"a_0_arg"}}, + {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}}, + {{"F"}, + "UnaryTest", + {"outside_compilation_O1_host_compute:outputs:0"}}, + {{"H"}, "UnaryTest", {"F:o:0"}}, + {{"outside_compilation_O1_host_compute"}, + "XlaHostCompute", + {"D:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", + "_outside_compilation_shape_inference_F1_O1"}, + {"shapes", gtl::ArraySlice({})}, + {"_outside_compilation_subgraph", "O1"}}}, + }, + {{"h_0_retval", "H:o:0"}}); + + { + std::unique_ptr lib_def( + new FunctionLibraryDefinition(OpRegistry::Global(), library_expected)); + GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get()); + Node* a = Input(b2.opts().WithName("A")); + Node* b = Input(b2.opts().WithName("B")); + + Node* key_constant = + KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder")); + Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", + {DT_FLOAT}, b2.opts()); + Node* e = Unary(recv, b2.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* send = + SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts()); + /*Node* g =*/Unary(a, b2.opts() + .WithName("G") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O2") + .WithControlInput(e)); + Node* s1 = Sequencer( + b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}), + "F1"); + NodeBuilder node_builder1("F1", "F1", lib_def.get()); + node_builder1.Input(a).Input(b).ControlInput(s1); + Node* call1 = b2.opts().FinalizeBuilder(&node_builder1); + + Binary(e, call1, b2.opts().WithName("I")); + 
TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected)); + } + + TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef); + TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library); +} + +// Test with two outside_compilation clusters that interact outside the compiled +// subgraph. +TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { + FunctionDefLibrary library; + GraphDef graphdef; + + { + GraphDefBuilder b1(GraphDefBuilder::kFailImmediately); + Node* a = Input(b1.opts().WithName("A")); + Node* b = Input(b1.opts().WithName("B")); + Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1")); + Node* d = + Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1")); + Node* e = Unary(d, b1.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* f = Unary(e, b1.opts().WithName("F").WithAttr("_encapsulate", "F1")); + Node* g = Unary(d, b1.opts() + .WithName("G") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O2") + .WithControlInput(e)); + Node* h = Unary(f, b1.opts().WithName("H").WithAttr("_encapsulate", "F1")); + /*Node* i =*/Binary(d, e, + b1.opts() + .WithName("I") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O3") + .WithControlInput(g)); + Binary(e, h, b1.opts().WithName("J")); + TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); + } + + TF_EXPECT_OK(Encapsulate(&graphdef, &library)); + + FunctionDefLibrary library_expected; + GraphDef graphdef_expected; + + { + GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately); + Node* key_constant = + KeyPlaceholderShape(shape1.opts().WithName("KnownShape/_0")); + Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", + {DT_FLOAT}, shape1.opts()); + Node* e = Unary(ops::NodeOut(recv2, 0), shape1.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape1.opts()); + TF_EXPECT_OK( + AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected)); + } + + *library_expected.add_function() = FunctionDefHelper::Create( + "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval:float"}, {}, + {{{"C"}, "UnaryTest", {"a_0_arg"}}, + {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}}, + {{"F"}, "UnaryTest", {"outside_compilation_O1_host_compute:outputs:0"}}, + {{"H"}, "UnaryTest", {"F:o:0"}}, + {{"outside_compilation_O1_host_compute"}, + "XlaHostCompute", + {"D:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", + "_outside_compilation_shape_inference_F1_O1"}, + {"shapes", gtl::ArraySlice({})}, + {"_outside_compilation_subgraph", "O1"}}}, + {{"outside_compilation_O2_host_compute"}, + "XlaHostCompute", + {"D:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({})}, + {"ancestors", + gtl::ArraySlice({"outside_compilation_O1_host_compute"})}, + {"key", "host_compute_channel_F1_O2"}, + {"shape_inference_graph", ""}, + {"shapes", gtl::ArraySlice({})}, + {"_outside_compilation_subgraph", "O2"}}, + {"outside_compilation_O1_host_compute"}}, + {{"outside_compilation_O3_host_compute"}, + "XlaHostCompute", + {"D:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({})}, + {"ancestors", + gtl::ArraySlice({"outside_compilation_O1_host_compute", + "outside_compilation_O2_host_compute"})}, + {"key", "host_compute_channel_F1_O3"}, + {"shape_inference_graph", ""}, + 
{"shapes", gtl::ArraySlice({})}, + {"_outside_compilation_subgraph", "O3"}}, + {"outside_compilation_O1_host_compute", + "outside_compilation_O2_host_compute"}}}, + {{"h_0_retval", "H:o:0"}}); + + { + std::unique_ptr lib_def( + new FunctionLibraryDefinition(OpRegistry::Global(), library_expected)); + GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get()); + Node* a = Input(b2.opts().WithName("A")); + Node* b = Input(b2.opts().WithName("B")); + + Node* key_constant = + KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder")); + Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", + {DT_FLOAT}, b2.opts()); + Node* e = Unary(recv1, b2.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* send = + SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts()); + Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2", + {DT_FLOAT}, b2.opts()); + Node* g = Unary(recv2, b2.opts() + .WithName("G") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O2") + .WithControlInput(e)); + Node* recv3 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O3", + {DT_FLOAT}, b2.opts()); + /*Node* i =*/Binary(recv3, e, + b2.opts() + .WithName("I") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O3") + .WithControlInput(g)); + Node* s1 = Sequencer(b2.opts() + .WithName("F1_sequencer") + .WithControlInputs({recv1, send, recv2, recv3}), + "F1"); + NodeBuilder node_builder1("F1", "F1", lib_def.get()); + node_builder1.Input(a).Input(b).ControlInput(s1); + Node* call1 = b2.opts().FinalizeBuilder(&node_builder1); + + Binary(e, call1, b2.opts().WithName("J")); + TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected)); + } + + TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef); + TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library); +} + // Test with one outside_compilation cluster that has no outputs from the // compiled subgraph. 
TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) {
@@ -1731,6 +2312,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
            {"c:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"ancestors", gtl::ArraySlice<string>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph",
              "_outside_compilation_shape_inference_F1_O1"},
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index 86263d847ae..c0e99676849 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -813,4 +813,29 @@ Status XlaCompiler::SetHostToDeviceMetadata(
   return Status::OK();
 }
 
+Status XlaCompiler::GetHostComputeControlDependency(
+    const string& host_compute_name, xla::ComputationDataHandle* handle) {
+  const auto iter = host_compute_control_output_.find(host_compute_name);
+  if (iter == host_compute_control_output_.end()) {
+    return errors::InvalidArgument(
+        "No registered control handle for host compute Op '",
+        host_compute_name, "'");
+  } else {
+    *handle = iter->second;
+  }
+  return Status::OK();
+}
+
+Status XlaCompiler::SetHostComputeControlDependency(
+    const string& host_compute_name,
+    const xla::ComputationDataHandle& handle) {
+  if (host_compute_control_output_.find(host_compute_name) !=
+      host_compute_control_output_.end()) {
+    return errors::InvalidArgument(
+        "Duplicate control handles registered for host compute Op ",
+        host_compute_name);
+  }
+  host_compute_control_output_[host_compute_name] = handle;
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index a6747bbe72e..8f564f35ec8 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -325,6 +325,23 @@ class XlaCompiler {
                                     gtl::ArraySlice<DataType> types,
                                     gtl::ArraySlice<TensorShape> shapes);
 
+  // In order to avoid deadlocks from dependencies in host computations, it can
+  // be necessary to enforce a partial order on the execution of HostCompute
+  // Ops. In particular it may be necessary to constrain the SendToHost for one
+  // HostCompute to run before blocking on the RecvAtHost for another
+  // HostCompute. The compiler maintains a mapping from 'host_compute_name' to
+  // handle, where the handle is an 'output' of the HostCompute Op corresponding
+  // to 'host_compute_name'. Another HostCompute Op that needs to be sequenced
+  // later can add the handle as an 'input' to enforce the constraints.
+  // 'host_compute_name' can be any string the client wishes to use to identify
+  // a given HostCompute Op as long as the names are unique within the
+  // compilation.
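+  // A hedged usage sketch (the handle and name below are invented, not code
+  // from this change): the lowering of one HostCompute registers its token,
+  // and a dependent HostCompute fetches it to sequence itself after the send:
+  //
+  //   TF_RETURN_IF_ERROR(compiler->SetHostComputeControlDependency(
+  //       "host_compute_F1_O1", token_from_send));
+  //   ...
+  //   xla::ComputationDataHandle dep;
+  //   TF_RETURN_IF_ERROR(compiler->GetHostComputeControlDependency(
+  //       "host_compute_F1_O1", &dep));
+  //   // Thread 'dep' into the dependent op's operands to enforce ordering.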
+  Status GetHostComputeControlDependency(const string& host_compute_name,
+                                         xla::ComputationDataHandle* handle);
+  Status SetHostComputeControlDependency(
+      const string& host_compute_name,
+      const xla::ComputationDataHandle& handle);
+
   const Options& options() const { return options_; }
   xla::Client* client() const { return options_.client; }
   FunctionLibraryRuntime* flib_runtime() const { return flib_runtime_; }
@@ -391,6 +408,9 @@ class XlaCompiler {
   std::unordered_map<string, tf2xla::HostTransferMetadata> host_compute_sends_;
   std::unordered_map<string, tf2xla::HostTransferMetadata> host_compute_recvs_;
 
+  std::unordered_map<string, xla::ComputationDataHandle>
+      host_compute_control_output_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(XlaCompiler);
 };

From d82d04f15992e224743f29aa75134ed04aa064a7 Mon Sep 17 00:00:00 2001
From: Derek Murray
Date: Fri, 20 Apr 2018 13:58:51 -0700
Subject: [PATCH 0544/1734] Automated g4 rollback of changelist 193694958

PiperOrigin-RevId: 193718607
---
 .../core/distributed_runtime/master_session.cc | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 1c67b42e761..ebe350d313d 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -89,10 +89,6 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   ~ReffedClientGraph() override {
     if (should_deregister_) {
       DeregisterPartitions();
-    } else {
-      for (Part& part : partitions_) {
-        worker_cache_->ReleaseWorker(part.name, part.worker);
-      }
     }
   }
 
@@ -1178,8 +1174,14 @@ Status MasterSession::Create(GraphDef* graph_def,
     TF_RETURN_IF_ERROR(GraphExecutionState::MakeForBaseGraph(
         graph_def, execution_options, &execution_state_));
   }
-  should_delete_worker_sessions_ = true;
-  return CreateWorkerSessions(options);
+  // TODO(b/36574172): Remove these conditions when ClusterSpec
+  // propagation is supported in all servers.
+  if (options.cluster_def != nullptr ||
+      session_opts_.config.isolate_session_state()) {
+    should_delete_worker_sessions_ = true;
+    return CreateWorkerSessions(options);
+  }
+  return Status::OK();
 }
 
 Status MasterSession::CreateWorkerSessions(

From 9fc5bacba49eb31c7d536963879ccc62ecfbaf76 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Fri, 20 Apr 2018 14:25:57 -0700
Subject: [PATCH 0545/1734] Pin rbe-debian8-tf container to a newer base image

- Also improve how numpy is installed (not compiling from source) for
  containers based on distros other than Ubuntu 14.04

PiperOrigin-RevId: 193722848
---
 tensorflow/tools/ci_build/Dockerfile.rbe.cpu | 2 +-
 .../tools/ci_build/install/install_pip_packages.sh | 9 +++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cpu b/tensorflow/tools/ci_build/Dockerfile.rbe.cpu
index 6f0798b1afc..3bc52b9ed61 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cpu
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cpu
@@ -1,4 +1,4 @@
-FROM launcher.gcr.io/google/rbe-debian8:r322167
+FROM launcher.gcr.io/google/rbe-debian8:r327695
 LABEL maintainer="Yu Yi "
 
 # Copy install scripts
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index 9644277fabf..5aaf544afdc 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -65,8 +65,13 @@ rm -rf /usr/lib/python3/dist-packages/six*
 # numpy needs to be installed from source to fix segfaults.
 # See: https://github.com/tensorflow/tensorflow/issues/6968
 # This workaround isn't needed for Ubuntu 16.04 or later.
-pip2 install --no-binary=:all: --upgrade numpy==1.12.0
-pip3 install --no-binary=:all: --upgrade numpy==1.12.0
+if $(cat /etc/*-release | grep -q 14.04); then
+  pip2 install --no-binary=:all: --upgrade numpy==1.12.0
+  pip3 install --no-binary=:all: --upgrade numpy==1.12.0
+else
+  pip2 install --upgrade numpy==1.12.0
+  pip3 install --upgrade numpy==1.12.0
+fi
 
 pip2 install scipy==0.18.1
 pip3 install scipy==0.18.1

From 9f312f32091534bfc115212d2ec7c838180df663 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Fri, 20 Apr 2018 14:30:48 -0700
Subject: [PATCH 0546/1734] Updating Generate Random Tensor to generate
 tensors whose values are small and do not cause overflow for arithmetic
 operations.

PiperOrigin-RevId: 193723661
---
 tensorflow/core/grappler/optimizers/BUILD      | 1 -
 tensorflow/core/grappler/utils/BUILD           | 1 +
 tensorflow/core/grappler/utils/grappler_test.h | 4 +++-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 3ab8d8f584c..42c3580d40f 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -112,7 +112,6 @@ tf_cc_test(
     name = "constant_folding_test",
     srcs = ["constant_folding_test.cc"],
     shard_count = 5,
-    tags = ["noasan"],
    deps = [
         ":constant_folding",
         "//tensorflow/cc:cc_ops",
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index b473f32c450..44ef4a965b5 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -128,6 +128,7 @@ cc_library(
         "//tensorflow/core:direct_session",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core/grappler:grappler_item",
diff --git a/tensorflow/core/grappler/utils/grappler_test.h b/tensorflow/core/grappler/utils/grappler_test.h
index e1394b9c35f..c2ba5ee7e8a 100644
--- a/tensorflow/core/grappler/utils/grappler_test.h
+++ b/tensorflow/core/grappler/utils/grappler_test.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
@@ -62,7 +63,8 @@ class GrapplerTest : public ::testing::Test {
   Tensor GenerateRandomTensor(const TensorShape& shape) const {
     typedef typename EnumToDataType<DTYPE>::Type T;
     Tensor tensor(DTYPE, shape);
-    tensor.flat<T>() = tensor.flat<T>().random();
+    for (auto i = 0; i < tensor.NumElements(); i++)
+      tensor.flat<T>()(i) = i + random::New64() % 10;
     return tensor;
   }
 
From bc78f9b060cece8e29a89f7dbcdedcadbc61891d Mon Sep 17 00:00:00 2001
From: "A.
Unique TensorFlower" Date: Fri, 20 Apr 2018 14:32:07 -0700 Subject: [PATCH 0547/1734] internal END_PUBLIC BEGIN_PUBLIC Automated g4 rollback of changelist 193600682 PiperOrigin-RevId: 193723856 --- .../layers/python/layers/rev_block_lib.py | 77 ++----------- .../python/layers/rev_block_lib_test.py | 102 ------------------ 2 files changed, 11 insertions(+), 168 deletions(-) diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib.py b/tensorflow/contrib/layers/python/layers/rev_block_lib.py index 9f904cc3028..02d294c68f1 100644 --- a/tensorflow/contrib/layers/python/layers/rev_block_lib.py +++ b/tensorflow/contrib/layers/python/layers/rev_block_lib.py @@ -45,7 +45,6 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import variable_scope from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import nest -from tensorflow.python.util import tf_inspect __all__ = ["rev_block", "RevBlock", "recompute_grad"] @@ -430,13 +429,12 @@ def enable_with_args(dec): @enable_with_args -def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False, - tensor_arg_names=None): +def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False): """Decorator that recomputes the function on the backwards pass. Args: - fn: the subgraph-producing function to wrap and recompute when computing - gradients. Provide `tensor_arg_names` if not all arguments are `Tensor`s. + fn: a function that takes Tensors (all as positional arguments) and returns + a tuple of Tensors. use_data_dep: `bool`, if `True` will use a dummy data dependency to force the recompute to happen. If `False` will use a control dependency. By default will be `True` if in an XLA context and `False` otherwise. XLA @@ -445,25 +443,17 @@ def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False, that all gradients are produced before any are consumed by downstream ops. If `use_data_dep` is also `True`, will use a data dependency instead of a control dependency. - tensor_arg_names: `list`, names of the `Tensor` arguments to `fn`. If - `None`, assumes all arguments are `Tensor`s. Returns: A wrapped fn that is identical to fn when called, but its activations will be discarded and recomputed on the backwards pass (i.e. on a call to tf.gradients). """ - if tensor_arg_names: - if not isinstance(tensor_arg_names, (list, tuple)): - raise TypeError("tensor_arg_names must be a list") @functools.wraps(fn) - def wrapped(*args, **kwargs): - tensor_only_fn, tensor_args = _make_tensor_only(fn, args, kwargs, - tensor_arg_names) + def wrapped(*args): return _recompute_grad( - tensor_only_fn, tensor_args, use_data_dep=use_data_dep, - tupleize_grads=tupleize_grads) + fn, args, use_data_dep=use_data_dep, tupleize_grads=tupleize_grads) return wrapped @@ -473,59 +463,11 @@ def _is_on_tpu(): return control_flow_util.GetContainingXLAContext(ctxt) is not None -def _make_tensor_only(fn, args, kwargs, tensor_arg_names): - """Return fn such that it only takes Tensor args for tensor_arg_names.""" - argspec = tf_inspect.getargspec(fn) - if argspec.varargs is not None or argspec.keywords is not None: - raise ValueError("Function decorated with recompute_grad must not use " - "*args or **kwargs.") - fn_arg_names = list(argspec.args) - - # name_to_arg is a dict of argument name to argument value, including both - # positional and keyword arguments passed. - name_to_arg = {} - # Populate positional arguments. 
- for name, arg in zip(fn_arg_names[:len(args)], args): - name_to_arg[name] = arg - # Populate keyword arguments. - name_to_arg.update(kwargs) - - # Separate the Tensor arguments from the non-Tensor arguments. - # The default is that all arguments are Tensor arguments. - tensor_arg_names = tensor_arg_names or fn_arg_names - for name in tensor_arg_names: - if name not in name_to_arg: - raise ValueError("Must provide Tensor argument %s" % name) - tensor_args = [name_to_arg[name] for name in tensor_arg_names] - non_tensor_kwargs = dict([(name, arg) for name, arg in name_to_arg.items() - if name not in tensor_arg_names]) - - # Check that Tensor arguments are in fact Tensors and that non-Tensor - # arguments are not. - for name, arg in zip(tensor_arg_names, tensor_args): - if not isinstance(arg, framework_ops.Tensor): - raise TypeError("Fn argument %s must be a Tensor." % name) - for name, arg in non_tensor_kwargs.items(): - if isinstance(arg, framework_ops.Tensor): - raise TypeError("Fn argument %s must not be a Tensor." % name) - - # Construct a Tensor-only wrapper function that will pass the non-Tensor - # arguments as well when called. - def tensor_only_fn(*tensors): - all_kwargs = dict(zip(tensor_arg_names, tensors)) - all_kwargs.update(non_tensor_kwargs) - return fn(**all_kwargs) - - return tensor_only_fn, tensor_args - - -def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, - tupleize_grads=False): +def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, tupleize_grads=False): """See recompute_grad.""" for arg in args: if not isinstance(arg, framework_ops.Tensor): raise ValueError("All inputs to function must be Tensors") - use_data_dep_ = use_data_dep if use_data_dep_ == _USE_DEFAULT: use_data_dep_ = _is_on_tpu() @@ -559,11 +501,14 @@ def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, grad_vars = grads[len(inputs):] return grad_inputs, grad_vars - # TODO(rsepassi): Replace with tf.custom_gradient @_fn_with_custom_grad(grad_fn) def fn_with_recompute(*args): cached_vs.append(variable_scope.get_variable_scope()) - cached_arg_scope.append(contrib_framework_ops.current_arg_scope()) + # TODO(rsepassi): Rm conditional in TF 1.4 + if hasattr(contrib_framework_ops, "current_arg_scope"): + cached_arg_scope.append(contrib_framework_ops.current_arg_scope()) + else: + cached_arg_scope.append({}) return fn(*args) return fn_with_recompute(*args) diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py index 66ccc696f92..392a490be15 100644 --- a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py +++ b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py @@ -318,108 +318,6 @@ class RecomputeTest(test.TestCase): self.assertEqual(1, len(grads)) self.assertTrue(grads[0] is not None) - def testWithNontensorArgs(self): - @rev_block_lib.recompute_grad(tupleize_grads=True, - tensor_arg_names=["inputs"]) - def layer_with_recompute(inputs, plus=None): - var = variable_scope.get_variable("var", ()) - self.assertFalse(plus) # called with False below - if plus: - return var + inputs - else: - return var * inputs - - inputs = array_ops.ones((), dtypes.float32) - outputs = layer_with_recompute(inputs, plus=False) - loss = math_ops.square(outputs) - grads = gradients_impl.gradients(loss, variables.trainable_variables()) - self.assertEqual(1, len(grads)) - self.assertTrue(grads[0] is not None) - - -class MakeTensorOnlyTest(test.TestCase): - - def testMakeTensorOnly(self): - def fn(a, b, c, d=1, e=None, f=7): 
- return (a, b, c, d, e, f) - - t1 = array_ops.ones(()) - t2 = array_ops.ones(()) - t3 = array_ops.ones(()) - args = [1, t1, 3, t2] - kwargs = {"e": t3} - tensor_only_fn, tensor_args = rev_block_lib._make_tensor_only( - fn, args, kwargs, ["b", "d", "e"]) - self.assertAllEqual(tensor_args, [t1, t2, t3]) - out = tensor_only_fn(*tensor_args) - self.assertAllEqual(out, (1, t1, 3, t2, t3, 7)) - - def testMakeTensorOnlyPositionalArgsOnly(self): - def fn(a, b, c): - return (a, b, c) - - t1 = array_ops.ones(()) - t2 = array_ops.ones(()) - args = [t1, 3, t2] - tensor_only_fn, tensor_args = rev_block_lib._make_tensor_only( - fn, args, {}, ["a", "c"]) - self.assertAllEqual(tensor_args, [t1, t2]) - out = tensor_only_fn(*tensor_args) - self.assertAllEqual(out, (t1, 3, t2)) - - def testMakeTensorOnlyKwargsArgsOnly(self): - def fn(a=1, b=2, c=3): - return (a, b, c) - - t1 = array_ops.ones(()) - t2 = array_ops.ones(()) - args = [t1] - kwargs = {"c": t2} - tensor_only_fn, tensor_args = rev_block_lib._make_tensor_only( - fn, args, kwargs, ["a", "c"]) - self.assertAllEqual(tensor_args, [t1, t2]) - out = tensor_only_fn(*tensor_args) - self.assertAllEqual(out, (t1, 2, t2)) - - def testErrorOnMissingTensorArg(self): - def fn(a, b): - return (a, b) - - with self.assertRaisesWithPredicateMatch( - ValueError, "provide Tensor argument"): - rev_block_lib._make_tensor_only(fn, [], {"b": 2}, ["a"]) - - def testErrorOnSignatureSplats(self): - def fn1(a, *args): - return (a, args) - - err_msg = r"must not use \*args or \*\*kwargs" - with self.assertRaisesWithPredicateMatch(ValueError, err_msg): - rev_block_lib._make_tensor_only(fn1, [1, 2], {}, ["a"]) - - def fn2(a, **kwargs): - return (a, kwargs) - - with self.assertRaisesWithPredicateMatch(ValueError, err_msg): - rev_block_lib._make_tensor_only(fn2, [], {"a": 1, "b": 2}, ["a"]) - - def testErrorOnNonTensorForTensor(self): - def fn(a, b): - return (a, b) - - with self.assertRaisesWithPredicateMatch(TypeError, "must be a Tensor"): - rev_block_lib._make_tensor_only(fn, [2, 3], {}, ["a"]) - - def testErrorOnTensorForNonTensor(self): - def fn(a, b): - return (a, b) - - with self.assertRaisesWithPredicateMatch( - TypeError, "must not be a Tensor"): - t1 = array_ops.ones(()) - t2 = array_ops.ones(()) - rev_block_lib._make_tensor_only(fn, [t1, t2], {}, ["a"]) - class FnWithCustomGradTest(test.TestCase): From b133f8c70622e52f19631fd93d4b87ee21c52ac6 Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Fri, 20 Apr 2018 14:58:56 -0700 Subject: [PATCH 0548/1734] Move the guts of TFE_Execute into EagerExecute PiperOrigin-RevId: 193728072 --- tensorflow/c/eager/BUILD | 1 - tensorflow/c/eager/c_api.cc | 531 +----------------- tensorflow/core/common_runtime/eager/BUILD | 21 +- .../core/common_runtime/eager/execute.cc | 489 ++++++++++++++++ .../core/common_runtime/eager/execute.h | 7 + 5 files changed, 508 insertions(+), 541 deletions(-) diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index d66386acbd6..fae922ea3b4 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -31,7 +31,6 @@ tf_cuda_library( "//tensorflow/core/common_runtime/eager:context", "//tensorflow/core/common_runtime/eager:eager_executor", "//tensorflow/core/common_runtime/eager:execute", - "//tensorflow/core/common_runtime/eager:execute_node", "//tensorflow/core/common_runtime/eager:kernel_and_device", "//tensorflow/core/common_runtime/eager:tensor_handle", "//tensorflow/core/common_runtime/eager:copy_to_device_node", diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc 
index b7a30972083..975bde7c7f3 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -34,7 +34,6 @@ limitations under the License. #include "tensorflow/core/common_runtime/device_set.h" #include "tensorflow/core/common_runtime/eager/copy_to_device_node.h" #include "tensorflow/core/common_runtime/eager/execute.h" -#include "tensorflow/core/common_runtime/eager/execute_node.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/rendezvous_mgr.h" #include "tensorflow/core/framework/node_def_util.h" @@ -219,9 +218,6 @@ TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) { } return retval; } -} // extern "C" - -extern "C" { TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name, TF_Status* status) { @@ -423,531 +419,18 @@ void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name, attr_name, tensorflow::gtl::ArraySlice( funcs.get(), num_values)); } -} // extern "C" -namespace { - -// Initializes the step stats if needed. -void MaybeInitializeStepStats(tensorflow::StepStats* step_stats, - tensorflow::EagerContext* ctx) { - // Lazily initialize the RunMetadata with information about all devices if - // this is the first call. - while (step_stats->dev_stats_size() < ctx->devices()->size()) { - int device_idx = step_stats->dev_stats_size(); - auto* dev_stats = step_stats->add_dev_stats(); - dev_stats->set_device(ctx->devices()->at(device_idx)->name()); - } -} - -int StepStatsDeviceIndex(tensorflow::StepStats* step_stats, - tensorflow::EagerContext* ctx, - tensorflow::Device* device) { - // Find the current device's index. - if (device == nullptr) { - device = ctx->HostCPU(); - } - for (int i = 0; i < ctx->devices()->size(); ++i) { - if (ctx->devices()->at(i) == device || - ctx->devices()->at(i)->name() == device->name()) { - return i; - } - } - // TODO(apassos) do not fall back to host CPU if device is unknown. - return 0; -} - -tensorflow::Status ValidateInputTypeAndPlacement( - tensorflow::EagerContext* ctx, tensorflow::Device* op_device, - tensorflow::EagerOperation* op, const tensorflow::OpKernel* kernel, - tensorflow::RunMetadata* run_metadata) { - tensorflow::Device* host_device = ctx->HostCPU(); - const tensorflow::MemoryTypeVector& memtypes = kernel->input_memory_types(); - if (memtypes.size() != op->Inputs().size()) { - return tensorflow::errors::InvalidArgument( - "expected ", memtypes.size(), " inputs, got ", op->Inputs().size()); - } - for (int i = 0; i < op->Inputs().size(); ++i) { - const tensorflow::Device* expected_device = - memtypes[i] == tensorflow::HOST_MEMORY ? host_device : op_device; - tensorflow::TensorHandle* handle = op->Inputs()[i]; - tensorflow::Device* handle_device = nullptr; - TF_RETURN_IF_ERROR(handle->Device(&handle_device)); - const tensorflow::Device* actual_device = - handle_device == nullptr ? host_device : handle_device; - if (expected_device != actual_device) { - switch (ctx->GetDevicePlacementPolicy()) { - case tensorflow::DEVICE_PLACEMENT_SILENT_FOR_INT32: - // TODO(xpan): See if we could bubble python related error up - // to python level. - if (handle->dtype == tensorflow::DT_INT32) { - // Note: enabling silent copies of int32 tensors to match behavior - // of graph mode. 
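-          // (Policy summary, as a descriptive aside: EXPLICIT errors out on
-          // any cross-device input; WARN logs a warning and then copies;
-          // SILENT copies without logging; SILENT_FOR_INT32 behaves like
-          // EXPLICIT except that int32 tensors take this break and are
-          // copied silently, while other dtypes fall through to the
-          // EXPLICIT error.)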
- break; - } - TF_FALLTHROUGH_INTENDED; - case tensorflow::DEVICE_PLACEMENT_EXPLICIT: - return tensorflow::errors::InvalidArgument( - "Tensors on conflicting devices:" - " cannot compute ", - op->Name(), " as input #", i, " was expected to be on ", - expected_device->name(), " but is actually on ", - actual_device->name(), " (operation running on ", - op_device->name(), ")", - " Tensors can be copied explicitly using .gpu() or .cpu() " - "methods," - " or transparently copied by using tf.enable_eager_execution(" - "device_policy=tfe.DEVICE_PLACEMENT_SILENT). Copying tensors " - "between devices" - " may slow down your model"); - case tensorflow::DEVICE_PLACEMENT_WARN: - LOG(WARNING) << "before computing " << op->Name() << " input #" << i - << " was expected to be on " << expected_device->name() - << " but is actually on " << actual_device->name() - << " (operation running on " << op_device->name() - << "). This triggers a copy which can be a performance " - "bottleneck."; - break; - case tensorflow::DEVICE_PLACEMENT_SILENT: // Do nothing. - break; - } - // We are only here if the policy is warn or silent copies, so we should - // trigger a copy. - auto pre_time = tensorflow::Env::Default()->NowMicros(); - tensorflow::TensorHandle* copied_tensor = nullptr; - tensorflow::Status status = tensorflow::EagerCopyToDevice( - handle, ctx, expected_device->name().c_str(), &copied_tensor); - if (run_metadata != nullptr) { - auto* step_stats = run_metadata->mutable_step_stats(); - MaybeInitializeStepStats(step_stats, ctx); - // Record the sending on the source device for now. - int device_idx = StepStatsDeviceIndex(step_stats, ctx, handle_device); - auto* dev_stats = step_stats->mutable_dev_stats(device_idx); - auto* node_stats = dev_stats->add_node_stats(); - node_stats->set_node_name("_Send"); - node_stats->set_all_start_micros(pre_time); - node_stats->set_op_end_rel_micros( - tensorflow::Env::Default()->NowMicros() - pre_time); - } - if (!status.ok()) { - if (copied_tensor != nullptr) copied_tensor->Unref(); - return tensorflow::errors::Internal( - "Failed copying input tensor from ", actual_device->name(), " to ", - expected_device->name(), " in order to run ", op->Name(), ": ", - status.error_message()); - } - handle->Unref(); - handle = copied_tensor; - (*op->MutableInputs())[i] = copied_tensor; - } - if (handle->dtype != kernel->input_type(i)) { - return tensorflow::errors::InvalidArgument( - "cannot compute ", op->Name(), " as input #", i, - " was expected to be a ", - tensorflow::DataTypeString(kernel->input_type(i)), - " tensor but is a ", tensorflow::DataTypeString(handle->dtype), - " tensor"); - } - } - return tensorflow::Status::OK(); -} - -tensorflow::Device* SelectDevice(const tensorflow::NodeDef& ndef, - tensorflow::EagerContext* ctx, - TF_Status* status) { - tensorflow::DeviceSet ds; - for (tensorflow::Device* d : *ctx->devices()) { - ds.AddDevice(d); - } - tensorflow::DeviceTypeVector final_devices; - status->status = tensorflow::SupportedDeviceTypesForNode( - ds.PrioritizedDeviceTypeList(), ndef, &final_devices); - if (!status->status.ok()) { - return nullptr; - } - if (final_devices.empty()) { - status->status = tensorflow::errors::Internal( - "Could not find valid device for node ", ndef.DebugString()); - return nullptr; - } - for (tensorflow::Device* d : *ctx->devices()) { - if (d->device_type() == final_devices[0].type_string()) { - return d; - } - } - status->status = tensorflow::errors::Unknown( - "Could not find a device for node ", ndef.DebugString()); - return nullptr; -} - 
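[Aside: the deletions above and below are a move, not a removal — TFE_Execute keeps its public C signature and, after this patch, simply forwards to tensorflow::EagerExecute. A minimal usage sketch of the unchanged calling pattern (hedged: assumes a TFE_Context* ctx, an input TFE_TensorHandle* h, and a TF_Status* status created elsewhere; TFE_OpAddInput and TFE_DeleteOp belong to the same C API, though they do not appear in this hunk):

  TFE_Op* op = TFE_NewOp(ctx, "Identity", status);  // build the eager op
  TFE_OpAddInput(op, h, status);                    // attach an input handle
  TFE_TensorHandle* retvals[1];
  int num_retvals = 1;
  TFE_Execute(op, retvals, &num_retvals, status);   // now a thin shim over EagerExecute
  TFE_DeleteOp(op);

The caller-visible contract (retvals/num_retvals in-out convention, errors reported via TF_Status) is unchanged; only the implementation moves into tensorflow/core/common_runtime/eager/execute.cc.]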
-#ifdef TENSORFLOW_EAGER_USE_XLA
-// Synthesizes and returns a wrapper function over `op`, which must be a
-// primitive op (e.g. matmul).
-//
-// The wrapper function conforms to the function signature expected by
-// _XlaLaunchOp, with input params ordered by <constants, (variable) args and
-// resources>. For example, if the op has input params <Const1, Arg2, Const3,
-// Resource4, Arg5>, they will be reordered to <Const1, Const3, Arg2, Arg5,
-// Resource4> as the input params to the synthesized function.
-//
-// It populates `const_input_types`, `arg_input_types` and
-// `op_input_to_func_input` based on the reordering results, that the caller can
-// use them to build an _XlaLaunchOp. On error, it returns NULL, and sets
-// `status` accordingly.
-const tensorflow::FunctionDef* OpToFunction(
-    TFE_Op* op, std::vector<TF_DataType>* const_input_types,
-    std::vector<TF_DataType>* arg_input_types,
-    tensorflow::gtl::FlatMap<int, int>* op_input_to_func_input,
-    TF_Status* status) {
-  DCHECK(!op->operation.is_function());
-
-  tensorflow::FunctionDef fdef;
-
-  // Get the OpDef of the op we are trying to encapsulate.
-  TFE_Context* ctx = op->operation.ctx;
-  const tensorflow::OpRegistrationData* op_data;
-  {
-    status->status =
-        ctx->context.FindFunctionOpData(op->operation.Name(), &op_data);
-    if (!status->status.ok()) {
-      return nullptr;
-    }
-  }
-  const tensorflow::OpDef& op_def = op_data->op_def;
-
-  tensorflow::OpDef* signature = fdef.mutable_signature();
-
-  // Handle constant inputs.
-  const std::unordered_set<string> const_inputs(
-      *tensorflow::XlaOpRegistry::CompileTimeConstantInputs(
-          op->operation.Name()));
-
-  // First add place holders for the input args, so that we can refer to them by
-  // position in the next loop. Also tally up the resource inputs.
-  int num_resource_inputs = 0;
-  for (int i = 0; i < op_def.input_arg_size(); ++i) {
-    if (op_def.input_arg(i).type() == tensorflow::DT_RESOURCE) {
-      ++num_resource_inputs;
-    }
-    signature->add_input_arg();
-  }
-
-  // Now we map the input params from `op_def` to `signature`, where the param
-  // ordering for `signature` is: <constants, args, resources>.
-  int const_index = 0;
-  int arg_index = const_inputs.size();
-  int resource_index = op_def.input_arg_size() - num_resource_inputs;
-  for (int i = 0; i < op_def.input_arg_size(); ++i) {
-    const tensorflow::OpDef::ArgDef& op_input_arg = op_def.input_arg(i);
-    tensorflow::OpDef::ArgDef* func_input_arg = nullptr;
-    if (const_inputs.find(op_input_arg.name()) != const_inputs.end()) {
-      VLOG(1) << "For const input, mapping op input " << i << " to func input "
-              << const_index;
-      (*op_input_to_func_input)[i] = const_index;
-      func_input_arg = signature->mutable_input_arg(const_index++);
-      const_input_types->push_back(
-          static_cast<TF_DataType>(op->operation.Inputs()[i]->dtype));
-    } else if (op_input_arg.type() == tensorflow::DT_RESOURCE) {
-      VLOG(1) << "For resource input, mapping op input " << i
-              << " to func input " << resource_index;
-      (*op_input_to_func_input)[i] = resource_index;
-      func_input_arg = signature->mutable_input_arg(resource_index++);
-    } else {
-      VLOG(1) << "For arg input, mapping op input " << i << " to func input "
-              << arg_index;
-      (*op_input_to_func_input)[i] = arg_index;
-      func_input_arg = signature->mutable_input_arg(arg_index++);
-      arg_input_types->push_back(
-          static_cast<TF_DataType>(op->operation.Inputs()[i]->dtype));
-    }
-
-    func_input_arg->set_name(op_input_arg.name());
-    func_input_arg->set_type(op->operation.Inputs()[i]->dtype);
-  }
-  VLOG(1) << "Added OpDef Inputs: " << fdef.DebugString();
-
-  // Resources args are at the end of the function input params, and we should
-  // have iterated over all of them.
-  DCHECK_EQ(signature->input_arg_size(), resource_index);
-
-  // Make the synthesized function's name unique.
-  signature->set_name(tensorflow::strings::StrCat(
-      op_def.name(), func_id_generator.fetch_add(1)));
-
-  // Add the node def and set its input names to match op_def's names.
-  const tensorflow::NodeDef& ndef =
-      op->operation.MutableAttrs()->BuildNodeDef();
-  DCHECK_EQ(signature->input_arg_size(), ndef.input_size());
-  *fdef.add_node_def() = ndef;
-  for (int i = 0; i < op_def.input_arg_size(); ++i) {
-    fdef.mutable_node_def(0)->set_input(i, op_def.input_arg(i).name());
-  }
-  VLOG(1) << "Added NodeDef: " << fdef.DebugString();
-
-  // Fix the output names and set output types.
-  for (int i = 0; i < op_def.output_arg_size(); ++i) {
-    tensorflow::OpDef::ArgDef* arg = signature->add_output_arg();
-    const tensorflow::OpDef::ArgDef& op_def_arg = op_def.output_arg(i);
-    const string& out_tensor_name = tensorflow::strings::StrCat(
-        ndef.name(), ":", op_def_arg.name(), ":", 0);
-    arg->set_name(op_def_arg.name());
-    (*fdef.mutable_ret())[op_def_arg.name()] = out_tensor_name;
-    const string& type_attr = op_def_arg.type_attr();
-    if (!type_attr.empty()) {
-      auto i = ndef.attr().find(type_attr);
-      if (i == ndef.attr().end()) {
-        status->status = tensorflow::errors::InvalidArgument(
-            tensorflow::strings::StrCat("Could not find attr ", type_attr,
-                                        " in NodeDef ", ndef.DebugString()));
-        return nullptr;
-      }
-      arg->set_type(i->second.type());
-    }
-  }
-  VLOG(1) << "Fixed Output names and all types: " << fdef.DebugString();
-
-  status->status = ctx->context.AddFunctionDef(fdef);
-  if (!status->status.ok()) return nullptr;
-  const auto ret = ctx->context.FindFunctionDef(signature->name());
-  DCHECK(ret != nullptr);
-  return ret;
-}
-
-// Builds an _XLALaunchOp as a wrapper over 'op', so that 'op' can be executed
-// via XLA.
-std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
-  VLOG(1) << "Creating _XlaLaunchOp for TFE_Op " << op->operation.Name();
-  auto launch_op = std::unique_ptr<TFE_Op>(
-      TFE_NewOp(op->operation.ctx, "_XlaLaunch", status));
-  if (TF_GetCode(status) != TF_OK) return nullptr;
-  if (op->operation.device) {
-    TFE_OpSetDevice(launch_op.get(), op->operation.device->name().c_str(),
-                    status);
-    if (TF_GetCode(status) != TF_OK) return nullptr;
-  }
-
-  const tensorflow::FunctionDef* fdef;
-  { fdef = op->operation.ctx->FindFunctionDef(op->operation.Name()); }
-  std::vector<TF_DataType> const_input_types;
-  std::vector<TF_DataType> arg_input_types;
-  tensorflow::gtl::FlatMap<int, int> op_input_to_func_input;
-  if (fdef == nullptr) {
-    // See if this is a primitive op, and if so create a function for it, so
-    // that _XlaLaunchOp can access it.
-    fdef = OpToFunction(op, &const_input_types, &arg_input_types,
-                        &op_input_to_func_input, status);
-    if (!status->status.ok()) return nullptr;
-  } else {
-    // TODO(hongm): XlaOpRegistry::CompileTimeConstantInputs() does not work for
-    // functions, so we need to find another way to handle constant inputs.
-    for (int i = const_input_types.size();
-         i < fdef->signature().input_arg_size(); ++i) {
-      VLOG(1) << "Adding Targs from input arg " << i;
-      const tensorflow::OpDef::ArgDef& arg = fdef->signature().input_arg(i);
-      arg_input_types.push_back(static_cast<TF_DataType>(arg.type()));
-    }
-  }
-  DCHECK(fdef != nullptr);
-
-  // Copy inputs and their devices.
-  // Since input param reordering may have occurred between `op` and `launch_op`
-  // via `op_input_to_func_input`, adjust the actual inputs accordingly.
-  *launch_op->operation.MutableInputs() = op->operation.Inputs();
-  for (tensorflow::TensorHandle* h : launch_op->operation.Inputs()) {
-    h->Ref();
-  }
-  if (!op_input_to_func_input.empty()) {
-    DCHECK_EQ(op->operation.Inputs().size(), op_input_to_func_input.size());
-    for (int i = 0; i < op_input_to_func_input.size(); ++i) {
-      VLOG(1) << "mapping op input " << i << " to func input "
-              << op_input_to_func_input[i];
-
-      (*launch_op->operation.MutableInputs())[op_input_to_func_input[i]] =
-          op->operation.Inputs()[i];
-    }
-  }
-  launch_op->operation.MutableAttrs()->NumInputs(op->operation.Inputs().size());
-
-  TFE_OpSetAttrTypeList(launch_op.get(), "Tconstants", const_input_types.data(),
-                        const_input_types.size());
-
-  // Set Targs and Nresources attrs.
-  TFE_OpSetAttrTypeList(launch_op.get(), "Targs", arg_input_types.data(),
-                        arg_input_types.size());
-  const int num_resource_inputs = fdef->signature().input_arg_size() -
-                                  const_input_types.size() -
-                                  arg_input_types.size();
-  TFE_OpSetAttrInt(launch_op.get(), "Nresources", num_resource_inputs);
-
-  // Set Tresults attr.
-  std::vector<TF_DataType> tresults;
-  for (const tensorflow::OpDef::ArgDef& arg : fdef->signature().output_arg()) {
-    tresults.push_back(static_cast<TF_DataType>(arg.type()));
-  }
-  TFE_OpSetAttrTypeList(launch_op.get(), "Tresults", tresults.data(),
-                        tresults.size());
-
-  // Set function attr.
-  tensorflow::AttrValue attr_value;
-  tensorflow::NameAttrList* func = attr_value.mutable_func();
-  func->set_name(fdef->signature().name());
-  launch_op->attrs.Set("function", attr_value);
-
-  return launch_op;
-}
-#endif  // TENSORFLOW_EAGER_USE_XLA
-
-}  // namespace
-
-extern "C" {
-
-void TFE_Execute(TFE_Op* tfe_op, TFE_TensorHandle** retvals, int* num_retvals,
+void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
                  TF_Status* status) {
-  tensorflow::EagerOperation* op = &tfe_op->operation;
-  tensorflow::EagerContext* ctx = op->EagerContext();
-  status->status = ctx->GetStatus();
+  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> handle_retvals(
+      *num_retvals);
+  status->status =
+      tensorflow::EagerExecute(&op->operation, &handle_retvals, num_retvals);
   if (!status->status.ok()) {
     return;
   }
-#ifdef TENSORFLOW_EAGER_USE_XLA
-  std::unique_ptr<TFE_Op> xla_launch_op;
-  if (op->UseXla() && op->Name() != "_XlaLaunch") {
-    xla_launch_op = BuildXlaLaunch(op, status);
-    if (!status->status.ok()) {
-      return;
-    }
-    op = xla_launch_op.get();
-  }
-#endif  // TENSORFLOW_EAGER_USE_XLA
-  // Ensure all resource-touching ops run in the device the resource is,
-  // regardless of anything else that has been specified. This is identical to
-  // the graph mode behavior.
-  for (int i = 0; i < op->Inputs().size(); ++i) {
-    tensorflow::Device* input_op_device = nullptr;
-    status->status = op->Inputs()[i]->OpDevice(&input_op_device);
-    if (!status->status.ok()) return;
-    VLOG(2) << "for op " << op->Name() << " input " << i << " "
-            << tensorflow::DataTypeString(op->Inputs()[i]->dtype) << " "
-            << (input_op_device == nullptr ? "cpu" : input_op_device->name())
-            << " " << (op->Device() == nullptr ? "cpu" : op->Device()->name());
-    if (op->Inputs()[i]->dtype == tensorflow::DT_RESOURCE &&
-        (input_op_device != op->Device() || input_op_device == nullptr)) {
-      tensorflow::Device* d =
-          input_op_device == nullptr ?
ctx->HostCPU() : input_op_device; - VLOG(1) << "Changing device of operation " << op->Name() << " to " - << d->name() << " because input #" << i - << " is a resource in this device."; - op->SetDevice(d); - } - } - tensorflow::Device* device = op->Device(); - - tensorflow::Fprint128 cache_key = op->MutableAttrs()->CacheKey( - device == nullptr ? "unspecified" : device->name()); - tensorflow::KernelAndDevice* kernel = ctx->GetCachedKernel(cache_key); - if (kernel == nullptr) { - const tensorflow::NodeDef& ndef = op->MutableAttrs()->BuildNodeDef(); - if (device == nullptr) { - device = SelectDevice(ndef, ctx, status); - if (!status->status.ok()) { - return; - } - } - CHECK(device != nullptr); - if (ctx->LogDevicePlacement()) { - LOG(INFO) << "Executing op " << ndef.op() << " in device " - << device->name(); - } - kernel = new tensorflow::KernelAndDevice(ctx->GetRendezvous()); - // Knowledge of the implementation of Init (and in-turn - // FunctionLibraryRuntime::CreateKernel) tells us that ctx->func_lib_def - // will be accessed, so grab on to the lock. - // See WARNING comment in Execute (before kernel->Run) - would be nice to - // rework to avoid this subtlety. - tensorflow::tf_shared_lock l(*ctx->FunctionsMu()); - status->status = - tensorflow::KernelAndDevice::Init(ndef, ctx->func_lib(device), kernel); - if (!status->status.ok()) { - delete kernel; - return; - } - // Update output_dtypes inside `kernel`. - const tensorflow::OpDef* op_def = nullptr; - const tensorflow::FunctionDef* function_def = - ctx->FuncLibDef()->Find(ndef.op()); - if (function_def != nullptr) { - op_def = &(function_def->signature()); - } - if (op_def == nullptr) { - status->status = OpDefForOp(ndef.op().c_str(), &op_def); - if (!status->status.ok()) { - return; - } - } - tensorflow::DataTypeVector input_dtypes; - status->status = InOutTypesForNode(ndef, *op_def, &input_dtypes, - kernel->mutable_output_dtypes()); - if (!status->status.ok()) { - return; - } - ctx->AddKernelToCache(cache_key, kernel); - } - const tensorflow::DataTypeVector& output_dtypes = kernel->output_dtypes(); - const int output_dtypes_size = output_dtypes.size(); - if (output_dtypes_size > *num_retvals) { - TF_SetStatus(status, TF_INVALID_ARGUMENT, - tensorflow::strings::StrCat("Expecting ", output_dtypes.size(), - " outputs, but *num_retvals is ", - *num_retvals) - .c_str()); - return; - } - *num_retvals = output_dtypes_size; - if (device == nullptr) { - // TODO(apassos) debug how the assignment below might return a different - // device from the one requested above. - device = kernel->device(); - } - status->status = ValidateInputTypeAndPlacement( - ctx, device, op, kernel->kernel(), - ctx->ShouldStoreMetadata() ? ctx->RunMetadataProto() : nullptr); - if (!status->status.ok()) return; - std::unique_ptr maybe_stats; - if (ctx->ShouldStoreMetadata()) { - maybe_stats.reset(new tensorflow::NodeExecStats); - maybe_stats->set_node_name(op->Name()); - maybe_stats->set_all_start_micros(tensorflow::Env::Default()->NowMicros()); - maybe_stats->set_op_start_rel_micros(0); - maybe_stats->set_scheduled_micros(tensorflow::Env::Default()->NowMicros()); - // TODO(apassos) track referenced tensors - } - if (ctx->Async()) { - // Note that for async mode, execution order will make sure that all - // input handles are ready before executing them. - // TODO(agarwal): Consider executing "cheap" kernels inline for performance. 
- tensorflow::gtl::InlinedVector handle_retvals( - *num_retvals); - tensorflow::uint64 id = ctx->NextId(); - for (int i = 0; i < *num_retvals; ++i) { - tensorflow::TensorHandle* h = - new tensorflow::TensorHandle(id, output_dtypes[i], ctx); - retvals[i] = new TFE_TensorHandle(h); - handle_retvals[i] = h; - } - tensorflow::EagerNode* node = new tensorflow::ExecuteNode( - id, ctx, op->Device(), op->Inputs(), kernel, maybe_stats.release(), - output_dtypes, handle_retvals); - ctx->ExecutorAdd(node); - } else { - // Execute checks if retvals[i] is nullptr or not to figure if it needs to - // allocate it. - tensorflow::gtl::InlinedVector handle_retvals( - *num_retvals); - status->status = tensorflow::EagerExecute( - ctx, op->Device(), op->Inputs(), kernel, maybe_stats.get(), - handle_retvals.data(), *num_retvals); - for (int i = 0; i < *num_retvals; ++i) { - retvals[i] = new TFE_TensorHandle(handle_retvals[i]); - } + for (int i = 0; i < *num_retvals; ++i) { + retvals[i] = new TFE_TensorHandle(handle_retvals[i]); } } diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD index 00ac4a4e478..13d6b021b54 100644 --- a/tensorflow/core/common_runtime/eager/BUILD +++ b/tensorflow/core/common_runtime/eager/BUILD @@ -154,26 +154,15 @@ tf_cc_test( cc_library( name = "execute", srcs = ["execute.cc"], - hdrs = ["execute.h"], + hdrs = [ + "execute.h", + "execute_node.h", + ], deps = [ ":context", ":copy_to_device_node", - ":kernel_and_device", - ":tensor_handle", - "//tensorflow/core:core_cpu_lib", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - ], -) - -cc_library( - name = "execute_node", - hdrs = ["execute_node.h"], - deps = [ - ":context", ":eager_executor", - ":execute", + ":eager_operation", ":kernel_and_device", ":tensor_handle", "//tensorflow/core:core_cpu_lib", diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc index 98e8471102b..a514f81e146 100644 --- a/tensorflow/core/common_runtime/eager/execute.cc +++ b/tensorflow/core/common_runtime/eager/execute.cc @@ -18,8 +18,10 @@ limitations under the License. #include #include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_set.h" #include "tensorflow/core/common_runtime/eager/context.h" #include "tensorflow/core/common_runtime/eager/copy_to_device_node.h" +#include "tensorflow/core/common_runtime/eager/execute_node.h" #include "tensorflow/core/common_runtime/eager/kernel_and_device.h" #include "tensorflow/core/common_runtime/eager/tensor_handle.h" #include "tensorflow/core/framework/step_stats.pb.h" @@ -32,6 +34,493 @@ limitations under the License. namespace tensorflow { +namespace { + +// Initializes the step stats if needed. +void MaybeInitializeStepStats(StepStats* step_stats, EagerContext* ctx) { + // Lazily initialize the RunMetadata with information about all devices if + // this is the first call. + while (step_stats->dev_stats_size() < ctx->devices()->size()) { + int device_idx = step_stats->dev_stats_size(); + auto* dev_stats = step_stats->add_dev_stats(); + dev_stats->set_device(ctx->devices()->at(device_idx)->name()); + } +} + +int StepStatsDeviceIndex(StepStats* step_stats, EagerContext* ctx, + Device* device) { + // Find the current device's index. 
+ if (device == nullptr) { + device = ctx->HostCPU(); + } + for (int i = 0; i < ctx->devices()->size(); ++i) { + if (ctx->devices()->at(i) == device || + ctx->devices()->at(i)->name() == device->name()) { + return i; + } + } + // TODO(apassos) do not fall back to host CPU if device is unknown. + return 0; +} + +Status ValidateInputTypeAndPlacement(EagerContext* ctx, Device* op_device, + EagerOperation* op, const OpKernel* kernel, + RunMetadata* run_metadata) { + Device* host_device = ctx->HostCPU(); + const MemoryTypeVector& memtypes = kernel->input_memory_types(); + if (memtypes.size() != op->Inputs().size()) { + return errors::InvalidArgument("expected ", memtypes.size(), + " inputs, got ", op->Inputs().size()); + } + for (int i = 0; i < op->Inputs().size(); ++i) { + const Device* expected_device = + memtypes[i] == HOST_MEMORY ? host_device : op_device; + TensorHandle* handle = op->Inputs()[i]; + Device* handle_device = nullptr; + TF_RETURN_IF_ERROR(handle->Device(&handle_device)); + const Device* actual_device = + handle_device == nullptr ? host_device : handle_device; + if (expected_device != actual_device) { + switch (ctx->GetDevicePlacementPolicy()) { + case DEVICE_PLACEMENT_SILENT_FOR_INT32: + // TODO(xpan): See if we could bubble python related error up + // to python level. + if (handle->dtype == DT_INT32) { + // Note: enabling silent copies of int32 tensors to match behavior + // of graph mode. + break; + } + TF_FALLTHROUGH_INTENDED; + case DEVICE_PLACEMENT_EXPLICIT: + return errors::InvalidArgument( + "Tensors on conflicting devices:" + " cannot compute ", + op->Name(), " as input #", i, " was expected to be on ", + expected_device->name(), " but is actually on ", + actual_device->name(), " (operation running on ", + op_device->name(), ")", + " Tensors can be copied explicitly using .gpu() or .cpu() " + "methods," + " or transparently copied by using tf.enable_eager_execution(" + "device_policy=tfe.DEVICE_PLACEMENT_SILENT). Copying tensors " + "between devices" + " may slow down your model"); + case DEVICE_PLACEMENT_WARN: + LOG(WARNING) << "before computing " << op->Name() << " input #" << i + << " was expected to be on " << expected_device->name() + << " but is actually on " << actual_device->name() + << " (operation running on " << op_device->name() + << "). This triggers a copy which can be a performance " + "bottleneck."; + break; + case DEVICE_PLACEMENT_SILENT: // Do nothing. + break; + } + // We are only here if the policy is warn or silent copies, so we should + // trigger a copy. + auto pre_time = Env::Default()->NowMicros(); + TensorHandle* copied_tensor = nullptr; + Status status = EagerCopyToDevice( + handle, ctx, expected_device->name().c_str(), &copied_tensor); + if (run_metadata != nullptr) { + auto* step_stats = run_metadata->mutable_step_stats(); + MaybeInitializeStepStats(step_stats, ctx); + // Record the sending on the source device for now. 
+        int device_idx = StepStatsDeviceIndex(step_stats, ctx, handle_device);
+        auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
+        auto* node_stats = dev_stats->add_node_stats();
+        node_stats->set_node_name("_Send");
+        node_stats->set_all_start_micros(pre_time);
+        node_stats->set_op_end_rel_micros(Env::Default()->NowMicros() -
+                                          pre_time);
+      }
+      if (!status.ok()) {
+        if (copied_tensor != nullptr) copied_tensor->Unref();
+        return errors::Internal("Failed copying input tensor from ",
+                                actual_device->name(), " to ",
+                                expected_device->name(), " in order to run ",
+                                op->Name(), ": ", status.error_message());
+      }
+      handle->Unref();
+      handle = copied_tensor;
+      (*op->MutableInputs())[i] = copied_tensor;
+    }
+    if (handle->dtype != kernel->input_type(i)) {
+      return errors::InvalidArgument(
+          "cannot compute ", op->Name(), " as input #", i,
+          " was expected to be a ", DataTypeString(kernel->input_type(i)),
+          " tensor but is a ", DataTypeString(handle->dtype), " tensor");
+    }
+  }
+  return Status::OK();
+}
+
+Status SelectDevice(const NodeDef& ndef, EagerContext* ctx, Device** device) {
+  DeviceSet ds;
+  for (Device* d : *ctx->devices()) {
+    ds.AddDevice(d);
+  }
+  DeviceTypeVector final_devices;
+  auto status = SupportedDeviceTypesForNode(ds.PrioritizedDeviceTypeList(),
+                                            ndef, &final_devices);
+  if (!status.ok()) return status;
+  if (final_devices.empty()) {
+    return errors::Internal("Could not find valid device for node ",
+                            ndef.DebugString());
+  }
+  for (Device* d : *ctx->devices()) {
+    if (d->device_type() == final_devices[0].type_string()) {
+      *device = d;
+      return Status::OK();
+    }
+  }
+  return errors::Unknown("Could not find a device for node ",
+                         ndef.DebugString());
+}
+
+#ifdef TENSORFLOW_EAGER_USE_XLA
+// Synthesizes and returns a wrapper function over `op`, which must be a
+// primitive op (e.g. matmul).
+//
+// The wrapper function conforms to the function signature expected by
+// _XlaLaunchOp, with input params ordered by <constants, (variable) args and
+// resources>. For example, if the op has input params <Const1, Arg2, Const3,
+// Resource4, Arg5>, they will be reordered to <Const1, Const3, Arg2, Arg5,
+// Resource4> as the input params to the synthesized function.
+//
+// It populates `const_input_types`, `arg_input_types` and
+// `op_input_to_func_input` based on the reordering results, that the caller
+// can use them to build an _XlaLaunchOp. On error, it returns NULL, and sets
+// `status` accordingly.
+const FunctionDef* OpToFunction(TFE_Op* op,
+                                std::vector<TF_DataType>* const_input_types,
+                                std::vector<TF_DataType>* arg_input_types,
+                                gtl::FlatMap<int, int>* op_input_to_func_input,
+                                TF_Status* status) {
+  DCHECK(!op->operation.is_function());
+
+  FunctionDef fdef;
+
+  // Get the OpDef of the op we are trying to encapsulate.
+  TFE_Context* ctx = op->operation.ctx;
+  const OpRegistrationData* op_data;
+  {
+    status = ctx->context.FindFunctionOpData(op->operation.Name(), &op_data);
+    if (!status.ok()) {
+      return nullptr;
+    }
+  }
+  const OpDef& op_def = op_data->op_def;
+
+  OpDef* signature = fdef.mutable_signature();
+
+  // Handle constant inputs.
+  const std::unordered_set<string> const_inputs(
+      *XlaOpRegistry::CompileTimeConstantInputs(op->operation.Name()));
+
+  // First add place holders for the input args, so that we can refer to them
+  // by position in the next loop. Also tally up the resource inputs.
+  int num_resource_inputs = 0;
+  for (int i = 0; i < op_def.input_arg_size(); ++i) {
+    if (op_def.input_arg(i).type() == DT_RESOURCE) {
+      ++num_resource_inputs;
+    }
+    signature->add_input_arg();
+  }
+
+  // Now we map the input params from `op_def` to `signature`, where the param
+  // ordering for `signature` is: <constants, args, resources>.
+  int const_index = 0;
+  int arg_index = const_inputs.size();
+  int resource_index = op_def.input_arg_size() - num_resource_inputs;
+  for (int i = 0; i < op_def.input_arg_size(); ++i) {
+    const OpDef::ArgDef& op_input_arg = op_def.input_arg(i);
+    OpDef::ArgDef* func_input_arg = nullptr;
+    if (const_inputs.find(op_input_arg.name()) != const_inputs.end()) {
+      VLOG(1) << "For const input, mapping op input " << i << " to func input "
+              << const_index;
+      (*op_input_to_func_input)[i] = const_index;
+      func_input_arg = signature->mutable_input_arg(const_index++);
+      const_input_types->push_back(
+          static_cast<TF_DataType>(op->operation.Inputs()[i]->dtype));
+    } else if (op_input_arg.type() == DT_RESOURCE) {
+      VLOG(1) << "For resource input, mapping op input " << i
+              << " to func input " << resource_index;
+      (*op_input_to_func_input)[i] = resource_index;
+      func_input_arg = signature->mutable_input_arg(resource_index++);
+    } else {
+      VLOG(1) << "For arg input, mapping op input " << i << " to func input "
+              << arg_index;
+      (*op_input_to_func_input)[i] = arg_index;
+      func_input_arg = signature->mutable_input_arg(arg_index++);
+      arg_input_types->push_back(
+          static_cast<TF_DataType>(op->operation.Inputs()[i]->dtype));
+    }
+
+    func_input_arg->set_name(op_input_arg.name());
+    func_input_arg->set_type(op->operation.Inputs()[i]->dtype);
+  }
+  VLOG(1) << "Added OpDef Inputs: " << fdef.DebugString();
+
+  // Resources args are at the end of the function input params, and we should
+  // have iterated over all of them.
+  DCHECK_EQ(signature->input_arg_size(), resource_index);
+
+  // Make the synthesized function's name unique.
+  signature->set_name(
+      strings::StrCat(op_def.name(), func_id_generator.fetch_add(1)));
+
+  // Add the node def and set its input names to match op_def's names.
+  const NodeDef& ndef = op->operation.MutableAttrs()->BuildNodeDef();
+  DCHECK_EQ(signature->input_arg_size(), ndef.input_size());
+  *fdef.add_node_def() = ndef;
+  for (int i = 0; i < op_def.input_arg_size(); ++i) {
+    fdef.mutable_node_def(0)->set_input(i, op_def.input_arg(i).name());
+  }
+  VLOG(1) << "Added NodeDef: " << fdef.DebugString();
+
+  // Fix the output names and set output types.
+  for (int i = 0; i < op_def.output_arg_size(); ++i) {
+    OpDef::ArgDef* arg = signature->add_output_arg();
+    const OpDef::ArgDef& op_def_arg = op_def.output_arg(i);
+    const string& out_tensor_name =
+        strings::StrCat(ndef.name(), ":", op_def_arg.name(), ":", 0);
+    arg->set_name(op_def_arg.name());
+    (*fdef.mutable_ret())[op_def_arg.name()] = out_tensor_name;
+    const string& type_attr = op_def_arg.type_attr();
+    if (!type_attr.empty()) {
+      auto i = ndef.attr().find(type_attr);
+      if (i == ndef.attr().end()) {
+        status = errors::InvalidArgument(
+            strings::StrCat("Could not find attr ", type_attr, " in NodeDef ",
+                            ndef.DebugString()));
+        return nullptr;
+      }
+      arg->set_type(i->second.type());
+    }
+  }
+  VLOG(1) << "Fixed Output names and all types: " << fdef.DebugString();
+
+  status = ctx->context.AddFunctionDef(fdef);
+  if (!status.ok()) return nullptr;
+  const auto ret = ctx->context.FindFunctionDef(signature->name());
+  DCHECK(ret != nullptr);
+  return ret;
+}
+
+// Builds an _XLALaunchOp as a wrapper over 'op', so that 'op' can be executed
+// via XLA.
+std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
+  VLOG(1) << "Creating _XlaLaunchOp for TFE_Op " << op->operation.Name();
+  auto launch_op = std::unique_ptr<TFE_Op>(
+      TFE_NewOp(op->operation.ctx, "_XlaLaunch", status));
+  if (TF_GetCode(status) != TF_OK) return nullptr;
+  if (op->operation.device) {
+    TFE_OpSetDevice(launch_op.get(), op->operation.device->name().c_str(),
+                    status);
+    if (TF_GetCode(status) != TF_OK) return nullptr;
+  }
+
+  const FunctionDef* fdef;
+  { fdef = op->operation.ctx->FindFunctionDef(op->operation.Name()); }
+  std::vector<TF_DataType> const_input_types;
+  std::vector<TF_DataType> arg_input_types;
+  gtl::FlatMap<int, int> op_input_to_func_input;
+  if (fdef == nullptr) {
+    // See if this is a primitive op, and if so create a function for it, so
+    // that _XlaLaunchOp can access it.
+    fdef = OpToFunction(op, &const_input_types, &arg_input_types,
+                        &op_input_to_func_input, status);
+    if (!status.ok()) return nullptr;
+  } else {
+    // TODO(hongm): XlaOpRegistry::CompileTimeConstantInputs() does not work for
+    // functions, so we need to find another way to handle constant inputs.
+    for (int i = const_input_types.size();
+         i < fdef->signature().input_arg_size(); ++i) {
+      VLOG(1) << "Adding Targs from input arg " << i;
+      const OpDef::ArgDef& arg = fdef->signature().input_arg(i);
+      arg_input_types.push_back(static_cast<TF_DataType>(arg.type()));
+    }
+  }
+  DCHECK(fdef != nullptr);
+
+  // Copy inputs and their devices.
+  // Since input param reordering may have occurred between `op` and `launch_op`
+  // via `op_input_to_func_input`, adjust the actual inputs accordingly.
+  *launch_op->operation.MutableInputs() = op->operation.Inputs();
+  for (TensorHandle* h : launch_op->operation.Inputs()) {
+    h->Ref();
+  }
+  if (!op_input_to_func_input.empty()) {
+    DCHECK_EQ(op->operation.Inputs().size(), op_input_to_func_input.size());
+    for (int i = 0; i < op_input_to_func_input.size(); ++i) {
+      VLOG(1) << "mapping op input " << i << " to func input "
+              << op_input_to_func_input[i];
+
+      (*launch_op->operation.MutableInputs())[op_input_to_func_input[i]] =
+          op->operation.Inputs()[i];
+    }
+  }
+  launch_op->operation.MutableAttrs()->NumInputs(op->operation.Inputs().size());
+
+  TFE_OpSetAttrTypeList(launch_op.get(), "Tconstants", const_input_types.data(),
+                        const_input_types.size());
+
+  // Set Targs and Nresources attrs.
+  TFE_OpSetAttrTypeList(launch_op.get(), "Targs", arg_input_types.data(),
+                        arg_input_types.size());
+  const int num_resource_inputs = fdef->signature().input_arg_size() -
+                                  const_input_types.size() -
+                                  arg_input_types.size();
+  TFE_OpSetAttrInt(launch_op.get(), "Nresources", num_resource_inputs);
+
+  // Set Tresults attr.
+  std::vector<TF_DataType> tresults;
+  for (const OpDef::ArgDef& arg : fdef->signature().output_arg()) {
+    tresults.push_back(static_cast<TF_DataType>(arg.type()));
+  }
+  TFE_OpSetAttrTypeList(launch_op.get(), "Tresults", tresults.data(),
+                        tresults.size());
+
+  // Set function attr.
+  AttrValue attr_value;
+  NameAttrList* func = attr_value.mutable_func();
+  func->set_name(fdef->signature().name());
+  launch_op->attrs.Set("function", attr_value);
+
+  return launch_op;
+}
+#endif  // TENSORFLOW_EAGER_USE_XLA
+
+}  // namespace
+
+Status EagerExecute(EagerOperation* op,
+                    gtl::InlinedVector<TensorHandle*, 2>* retvals,
+                    int* num_retvals) {
+  EagerContext* ctx = op->EagerContext();
+  auto status = ctx->GetStatus();
+  if (!status.ok()) return status;
+#ifdef TENSORFLOW_EAGER_USE_XLA
+  std::unique_ptr<TFE_Op> xla_launch_op;
+  if (op->UseXla() && op->Name() != "_XlaLaunch") {
+    xla_launch_op = BuildXlaLaunch(op, status);
+    if (!status.ok()) return status;
+    op = xla_launch_op.get();
+  }
+#endif  // TENSORFLOW_EAGER_USE_XLA
+  // Ensure all resource-touching ops run in the device the resource is,
+  // regardless of anything else that has been specified. This is identical to
+  // the graph mode behavior.
+  for (int i = 0; i < op->Inputs().size(); ++i) {
+    Device* input_op_device = nullptr;
+    status = op->Inputs()[i]->OpDevice(&input_op_device);
+    if (!status.ok()) return status;
+    VLOG(2) << "for op " << op->Name() << " input " << i << " "
+            << DataTypeString(op->Inputs()[i]->dtype) << " "
+            << (input_op_device == nullptr ? "cpu" : input_op_device->name())
+            << " " << (op->Device() == nullptr ? "cpu" : op->Device()->name());
+    if (op->Inputs()[i]->dtype == DT_RESOURCE &&
+        (input_op_device != op->Device() || input_op_device == nullptr)) {
+      Device* d = input_op_device == nullptr ? ctx->HostCPU() : input_op_device;
+      VLOG(1) << "Changing device of operation " << op->Name() << " to "
+              << d->name() << " because input #" << i
+              << " is a resource in this device.";
+      op->SetDevice(d);
+    }
+  }
+  Device* device = op->Device();
+
+  Fprint128 cache_key = op->MutableAttrs()->CacheKey(
+      device == nullptr ? "unspecified" : device->name());
+  KernelAndDevice* kernel = ctx->GetCachedKernel(cache_key);
+  if (kernel == nullptr) {
+    const NodeDef& ndef = op->MutableAttrs()->BuildNodeDef();
+    if (device == nullptr) {
+      status = SelectDevice(ndef, ctx, &device);
+      if (!status.ok()) return status;
+    }
+    CHECK(device != nullptr);
+    if (ctx->LogDevicePlacement()) {
+      LOG(INFO) << "Executing op " << ndef.op() << " in device "
+                << device->name();
+    }
+    kernel = new KernelAndDevice(ctx->GetRendezvous());
+    // Knowledge of the implementation of Init (and in-turn
+    // FunctionLibraryRuntime::CreateKernel) tells us that ctx->func_lib_def
+    // will be accessed, so grab on to the lock.
+    // See WARNING comment in Execute (before kernel->Run) - would be nice to
+    // rework to avoid this subtlety.
+    tf_shared_lock l(*ctx->FunctionsMu());
+    status = KernelAndDevice::Init(ndef, ctx->func_lib(device), kernel);
+    if (!status.ok()) {
+      delete kernel;
+      return status;
+    }
+    // Update output_dtypes inside `kernel`.
+    const OpDef* op_def = nullptr;
+    const FunctionDef* function_def = ctx->FuncLibDef()->Find(ndef.op());
+    if (function_def != nullptr) {
+      op_def = &(function_def->signature());
+    }
+    if (op_def == nullptr) {
+      status = OpDefForOp(ndef.op().c_str(), &op_def);
+      if (!status.ok()) return status;
+    }
+    DataTypeVector input_dtypes;
+    status = InOutTypesForNode(ndef, *op_def, &input_dtypes,
+                               kernel->mutable_output_dtypes());
+    if (!status.ok()) return status;
+    ctx->AddKernelToCache(cache_key, kernel);
+  }
+  const DataTypeVector& output_dtypes = kernel->output_dtypes();
+  const int output_dtypes_size = static_cast<int>(output_dtypes.size());
+  if (output_dtypes_size > *num_retvals) {
+    return errors::InvalidArgument("Expecting ", output_dtypes.size(),
+                                   " outputs, but *num_retvals is ",
+                                   *num_retvals);
+  }
+  *num_retvals = output_dtypes_size;
+  if (device == nullptr) {
+    // TODO(apassos) debug how the assignment below might return a different
+    // device from the one requested above.
+    device = kernel->device();
+  }
+  status = ValidateInputTypeAndPlacement(
+      ctx, device, op, kernel->kernel(),
+      ctx->ShouldStoreMetadata() ? ctx->RunMetadataProto() : nullptr);
+  if (!status.ok()) return status;
+  std::unique_ptr<NodeExecStats> maybe_stats;
+  if (ctx->ShouldStoreMetadata()) {
+    maybe_stats.reset(new NodeExecStats);
+    maybe_stats->set_node_name(op->Name());
+    maybe_stats->set_all_start_micros(Env::Default()->NowMicros());
+    maybe_stats->set_op_start_rel_micros(0);
+    maybe_stats->set_scheduled_micros(Env::Default()->NowMicros());
+    // TODO(apassos) track referenced tensors
+  }
+  retvals->resize(*num_retvals);
+  if (ctx->Async()) {
+    // Note that for async mode, execution order will make sure that all
+    // input handles are ready before executing them.
+    // TODO(agarwal): Consider executing "cheap" kernels inline for performance.
+    tensorflow::uint64 id = ctx->NextId();
+    for (int i = 0; i < *num_retvals; ++i) {
+      (*retvals)[i] = new TensorHandle(id, output_dtypes[i], ctx);
+    }
+    EagerNode* node =
+        new ExecuteNode(id, ctx, op->Device(), op->Inputs(), kernel,
+                        maybe_stats.release(), output_dtypes, *retvals);
+    ctx->ExecutorAdd(node);
+  } else {
+    // Execute checks if retvals[i] is nullptr or not to figure if it needs to
+    // allocate it.
+    status = EagerExecute(ctx, op->Device(), op->Inputs(), kernel,
+                          maybe_stats.get(), retvals->data(), *num_retvals);
+  }
+
+  return status;
+}
+
 Status EagerExecute(EagerContext* ctx, Device* device,
                     const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
                     KernelAndDevice* kernel, NodeExecStats* maybe_stats,
diff --git a/tensorflow/core/common_runtime/eager/execute.h b/tensorflow/core/common_runtime/eager/execute.h
index 0f6ad031e1d..7c8d7e164d0 100644
--- a/tensorflow/core/common_runtime/eager/execute.h
+++ b/tensorflow/core/common_runtime/eager/execute.h
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/eager_operation.h"
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/eager/tensor_handle.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
@@ -25,6 +26,12 @@ limitations under the License.
 
 namespace tensorflow {
 
+// Utility function that executes a fully constructed EagerOperation.
+Status EagerExecute(
+    EagerOperation* op,
+    tensorflow::gtl::InlinedVector<TensorHandle*, 2>* retvals,
+    int* num_retvals);
+
 // Low-level utility to execute the kernel specified by kernel on device device,
 // with the inputs op_inputs, in the context ctx.
 Status EagerExecute(EagerContext* ctx, Device* device,

From 60a0e2f5261cf72da4e4d8e65b56b695d611b984 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Fri, 20 Apr 2018 15:19:59 -0700
Subject: [PATCH 0549/1734] Do not force default layout when there is no need
 to. Allow the inner computations to negotiate root and parameter layouts
 different from the default.

PiperOrigin-RevId: 193731341
---
 tensorflow/compiler/xla/service/BUILD         |   3 +
 .../xla/service/computation_layout.cc         |   7 +-
 .../compiler/xla/service/computation_layout.h |   5 +-
 .../compiler/xla/service/hlo_instruction.h    |   8 +
 .../compiler/xla/service/layout_assignment.cc | 326 +++++++++++++-----
 .../compiler/xla/service/layout_assignment.h  |  65 +++-
 tensorflow/compiler/xla/service/service.cc    |   5 +-
 .../compiler/xla/service/tuple_simplifier.cc  |  25 +-
 8 files changed, 324 insertions(+), 120 deletions(-)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 9555d918178..bc577c173d6 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -1953,10 +1953,12 @@ cc_library(
     deps = [
         ":computation_layout",
         ":hlo",
+        ":hlo_dce",
         ":hlo_graph_dumper",
         ":hlo_pass",
         ":logical_buffer",
         ":tuple_points_to_analysis",
+        ":tuple_simplifier",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -2433,6 +2435,7 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
     ],
 )
diff --git a/tensorflow/compiler/xla/service/computation_layout.cc b/tensorflow/compiler/xla/service/computation_layout.cc
index d2d4f14fcec..cb61f3da39f 100644
--- a/tensorflow/compiler/xla/service/computation_layout.cc
+++ b/tensorflow/compiler/xla/service/computation_layout.cc
@@ -23,12 +23,15 @@ limitations under the License.
 
 namespace xla {
 
-ComputationLayout::ComputationLayout(const ProgramShape& program_shape)
+ComputationLayout::ComputationLayout(const ProgramShape& program_shape,
+                                     bool ignore_layouts)
     : result_layout_(program_shape.result()) {
   for (auto& shape : program_shape.parameters()) {
     parameter_layouts_.emplace_back(shape);
   }
-  SetToDefaultLayout();
+  if (ignore_layouts) {
+    SetToDefaultLayout();
+  }
 }
 
 void ComputationLayout::SetToDefaultLayout() {
diff --git a/tensorflow/compiler/xla/service/computation_layout.h b/tensorflow/compiler/xla/service/computation_layout.h
index 80e102411c7..53c3a3f7b73 100644
--- a/tensorflow/compiler/xla/service/computation_layout.h
+++ b/tensorflow/compiler/xla/service/computation_layout.h
@@ -34,8 +34,9 @@ class ComputationLayout {
  public:
   // Constructs a ComputationLayout from a ProgramShape. The layouts of the
   // parameters and results are set to the default layout. Layouts in the
-  // ProgramShape are ignored.
-  explicit ComputationLayout(const ProgramShape& program_shape);
+  // ProgramShape are ignored if ignore_layouts is true.
+  explicit ComputationLayout(const ProgramShape& program_shape,
+                             bool ignore_layouts = true);
 
   // Returns the layout of a particular parameter.
const ShapeLayout& parameter_layout(int64 param_no) const { diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index a5e9aecb9e7..f3da3fc256e 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -956,6 +956,14 @@ class HloInstruction { void clear_sharding() { sharding_ = nullptr; } // Return true if this operator has a sharding assigned. bool has_sharding() const { return sharding_ != nullptr; } + // Checks whether the instruction has compatible sharding with the other + // instruction. + bool has_compatible_sharding(const HloInstruction* other) const { + if (!has_sharding()) { + return !other->has_sharding(); + } + return other->has_sharding() ? sharding() == other->sharding() : false; + } // When creating a new instruction which either replaces, or shifts up (kCopy // insertion case), another instruction, we need to make sure the certain diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index 2494569db53..7067b6f86a0 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -31,10 +31,12 @@ limitations under the License. #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" +#include "tensorflow/compiler/xla/service/tuple_simplifier.h" #include "tensorflow/compiler/xla/shape_layout.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -400,9 +402,9 @@ string LayoutConstraints::ToString() const { } Status LayoutAssignment::AddMandatoryConstraints( - const ComputationLayout& computation_layout, - const ChannelLayoutConstraints* channel_constraints, - HloComputation* computation, LayoutConstraints* constraints) { + const ComputationLayout* computation_layout, + ChannelLayoutConstraints* channel_constraints, HloComputation* computation, + LayoutConstraints* constraints) { VLOG(3) << "Adding mandatory layout constraints to computation " << computation->name(); @@ -424,11 +426,16 @@ Status LayoutAssignment::AddMandatoryConstraints( TF_RETURN_IF_ERROR(constraints->SetOperandLayout( instruction->outfeed_shape(), instruction, 0)); } else if (instruction->opcode() == HloOpcode::kParameter) { - // Parameter layouts must match the respective layout in - // ComputationLayout. - shape_with_layout = - &computation_layout.parameter_layout(instruction->parameter_number()) - .shape(); + if (computation_layout != nullptr) { + const ShapeLayout& parameter_layout = + computation_layout->parameter_layout( + instruction->parameter_number()); + if (parameter_layout.LayoutIsSet()) { + // Parameter layouts must match the respective layout in + // ComputationLayout, if there is one. 
+ shape_with_layout = ¶meter_layout.shape(); + } + } } if (shape_with_layout != nullptr) { TF_RETURN_IF_ERROR( @@ -493,9 +500,8 @@ Status LayoutAssignment::AddMandatoryConstraints( HloComputation* body = instruction->while_body(); HloComputation* condition = instruction->while_condition(); const HloInstruction* init = instruction->operand(0); - const ComputationLayout& body_layout = - FindOrDie(computation_layouts_, body); - const ComputationLayout& condition_layout = + ComputationLayout& body_layout = FindOrDie(computation_layouts_, body); + ComputationLayout& condition_layout = FindOrDie(computation_layouts_, condition); // Check a few invariants irrespective of layout. @@ -508,26 +514,19 @@ Status LayoutAssignment::AddMandatoryConstraints( condition_layout.parameter_shape(0))); DCHECK(ShapeUtil::Compatible(body_layout.result_shape(), init->shape())); - // Return error if earlier layout assignment of the embedded computations - // has produced conflicting layouts. - if (!ShapeUtil::Equal(body_layout.result_shape(), - body_layout.parameter_shape(0))) { - return InternalError( - "Parameter and result of body computation %s of while instruction " - "%s have different layouts: %s vs %s", - body->name().c_str(), instruction->name().c_str(), - ShapeUtil::HumanString(body_layout.result_shape()).c_str(), - ShapeUtil::HumanString(body_layout.parameter_shape(0)).c_str()); + if (body_layout.result_layout() != body_layout.parameter_layout(0)) { + VLOG(2) << "Reset %while body parameter layout: body=" << body->name() + << " while=" << instruction->name() + << " shape=" << body_layout.result_layout().ToString(); + *body_layout.mutable_parameter_layout(0) = body_layout.result_layout(); } - if (!ShapeUtil::Equal(body->root_instruction()->shape(), - condition->parameter_instruction(0)->shape())) { - return InternalError( - "Parameter of condition computation %s of while instruction " - "%s does not match body computation %s result: %s vs %s", - condition->name().c_str(), instruction->name().c_str(), - body->name().c_str(), - ShapeUtil::HumanString(condition_layout.parameter_shape(0)).c_str(), - ShapeUtil::HumanString(body_layout.result_shape()).c_str()); + if (condition_layout.parameter_layout(0) != + body_layout.parameter_layout(0)) { + VLOG(2) << "Reset %while condition parameter layout: cond=" + << condition->name() << " while=" << instruction->name() + << " shape=" << body_layout.parameter_layout(0).ToString(); + *condition_layout.mutable_parameter_layout(0) = + body_layout.parameter_layout(0); } // Constrain the output and the operand of the while instruction to match @@ -557,7 +556,20 @@ Status LayoutAssignment::AddMandatoryConstraints( true_computation_layout.parameter_shape(0))); DCHECK(ShapeUtil::Compatible( false_operand->shape(), false_computation_layout.parameter_shape(0))); - + if (true_computation_layout.result_layout() != + false_computation_layout.result_layout()) { + // We assign layouts in DFS fashion, so the true and false computations + // might have negotiated a different layout. But for the conditional + // instruction POV the layout must match, so we run again on the false + // computation, this time with proper computation layout. 
+ VLOG(2) << "Reset %conditional false computation result layout: " + "false_computation=" + << false_computation->name() + << " conditional=" << instruction->name() << " shape=" + << true_computation_layout.result_layout().ToString(); + *false_computation_layout.mutable_result_layout() = + true_computation_layout.result_layout(); + } TF_RETURN_IF_ERROR(constraints->SetInstructionLayout( true_computation_layout.result_shape(), instruction)); TF_RETURN_IF_ERROR(constraints->SetOperandLayout( @@ -593,10 +605,14 @@ Status LayoutAssignment::AddMandatoryConstraints( } } } - - // Finally set the result layout to match ComputationLayout. - return constraints->SetResultLayout( - computation_layout.result_layout().shape()); + // Finally set the result layout to match ComputationLayout, if there is one. + if (computation_layout != nullptr) { + const ShapeLayout& result_layout = computation_layout->result_layout(); + if (result_layout.LayoutIsSet()) { + TF_RETURN_IF_ERROR(constraints->SetResultLayout(result_layout.shape())); + } + } + return Status::OK(); } namespace { @@ -760,6 +776,7 @@ StatusOr LayoutAssignment::CreateCopyWithNewLayout( HloInstruction* copy = instruction->parent()->AddInstruction(HloInstruction::CreateUnary( instruction->shape(), HloOpcode::kCopy, instruction)); + RegisterAddedCopy(copy); SetupCopiedInstruction(*instruction, copy, {}); LayoutUtil::ClearLayout(copy->mutable_shape()); TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes( @@ -783,13 +800,19 @@ Status LayoutAssignment::CopyOperandIfLayoutsDiffer( TF_RET_CHECK(LayoutUtil::HasLayout(operand->shape())); if (ShapeUtil::Equal(operand_layout.shape(), operand->shape())) { + VLOG(5) << "Operand " << operand->ToString() << " layout matches in " + << instruction->ToString(); // Operand layout already matches our constraint. Nothing to do. return Status::OK(); } + VLOG(4) << "Operand " << operand->ToString() << " layout does not match " + << operand_layout.ToString() << " in " << instruction->ToString(); TF_ASSIGN_OR_RETURN(HloInstruction * operand_copy, CreateCopyWithNewLayout(operand_layout.shape(), operand)); + VLOG(4) << "New copy of " << operand->ToString() << " is " + << operand_copy->ToString(); return instruction->ReplaceOperandWith(operand_no, operand_copy); } @@ -896,15 +919,16 @@ Status LayoutAssignment::CheckLayouts(HloModule* module) { } } } - - // Finally verify the result layout matches the layout of the entry + // Finally verify the result layout, if set, matches the layout of the entry // computation root. - TF_RET_CHECK(ShapeUtil::Equal( - module->entry_computation()->root_instruction()->shape(), + const ShapeLayout& result_layout = FindOrDie(computation_layouts_, module->entry_computation()) - .result_layout() - .shape())); - + .result_layout(); + if (result_layout.LayoutIsSet()) { + TF_RET_CHECK(ShapeUtil::Equal( + module->entry_computation()->root_instruction()->shape(), + result_layout.shape())); + } return Status::OK(); } @@ -913,18 +937,13 @@ LayoutAssignment::LayoutAssignment( ChannelLayoutConstraints* channel_constraints) : entry_computation_layout_(entry_computation_layout), channel_layout_constraints_(channel_constraints) { - VLOG(1) << "entry computation layout given to layout assignment: " + VLOG(1) << "Entry computation layout given to layout assignment: " << entry_computation_layout_->ToString(); // Layouts of all parameter instructions must be set. 
for (const ShapeLayout& parameter_layout : entry_computation_layout_->parameter_layouts()) { CHECK(parameter_layout.LayoutIsSet()); } - // If the result layout is not set, then choose the default. - // TODO(b/29118294): Choose a better layout in this case. - if (!entry_computation_layout_->result_layout().LayoutIsSet()) { - entry_computation_layout_->mutable_result_layout()->SetToDefaultLayout(); - } } std::unique_ptr LayoutAssignment::ChooseOperandLayoutFromOutputLayout( @@ -1484,16 +1503,60 @@ Status LayoutAssignment::AssignLayouts(const LayoutConstraints& constraints, return Status::OK(); } +Status LayoutAssignment::CalculateComputationLayout( + HloComputation* computation) { + ComputationLayout computation_layout(computation->ComputeProgramShape(), + /*ignore_layouts=*/false); + InsertOrDie(&computation_layouts_, computation, computation_layout); + VLOG(2) << " Calculated ComputationLayout = " + << computation_layout.ToString(); + return Status::OK(); +} + +Status LayoutAssignment::ClearComputationLayouts(HloComputation* computation) { + // Clear existing layouts of the instructions. All layouts must be assigned + // by the LayoutAssignment pass, except for those on infeeds, parameters, + // and the computation result. The latter two are specified in + // computation_layout, so we only need to keep the existing layouts for + // infeeds. Clearing the layouts here avoids hiding potential bugs in the + // layout assignment pass that may accidently use the existing layout. + for (HloInstruction* instruction : computation->instructions()) { + if (instruction->opcode() == HloOpcode::kBitcast) { + // bitcasts are inherently layout sensitive and so a bitcast instruction + // present in the IR before layout assignment is a bug. + return InternalError( + "Unexpected bitcast operation seen during layout assignment: %s.", + instruction->ToString().c_str()); + } + if (instruction->opcode() != HloOpcode::kInfeed) { + LayoutUtil::ClearLayout(instruction->mutable_shape()); + } + } + return Status::OK(); +} + Status LayoutAssignment::RunOnComputation( - const ComputationLayout& computation_layout, + ComputationLayout* computation_layout, const TuplePointsToAnalysis& points_to_analysis, HloComputation* computation, ChannelLayoutConstraints* channel_constraints) { - DCHECK(computation_layout.LayoutIsSet()); - InsertOrDie(&computation_layouts_, computation, computation_layout); VLOG(2) << "LayoutAssignment::RunOnComputation(" << computation->name() << ")"; - VLOG(2) << " ComputationLayout = " << computation_layout.ToString(); + TF_RETURN_IF_ERROR(ClearComputationLayouts(computation)); + if (computation_layout != nullptr) { + auto it = computation_layouts_.find(computation); + if (it == computation_layouts_.end()) { + VLOG(2) << " New ComputationLayout = " << computation_layout->ToString(); + computation_layouts_.emplace(computation, *computation_layout); + } else { + TF_RET_CHECK(computation_layout == &it->second || + computation_layout == entry_computation_layout_); + VLOG(2) << " Existing ComputationLayout = " + << computation_layout->ToString(); + } + } else { + VLOG(2) << " No ComputationLayout specified (will be calculated)"; + } // Construct LayoutConstraints with all layout constraints of the computation. LayoutConstraints constraints(points_to_analysis, computation); @@ -1536,12 +1599,19 @@ Status LayoutAssignment::RunOnComputation( CHECK_LT(constraints.unconstrained_buffer_ids().size(), unconstrained_count); } - // All logical buffers should have constraints at this point. 
All that // remains is assign the constraints to the buffers and infer layouts for // aliased buffers. TF_RETURN_IF_ERROR(AssignLayouts(constraints, computation)); + // If the computation layout wasn't specified, now it is the time to compute + // it according to the parameters and root instruction layouts. + // This allows the first pass through this API to record the best flowing + // layout to parameters and root instruction. + if (computation_layout == nullptr) { + TF_RETURN_IF_ERROR(CalculateComputationLayout(computation)); + } + // Record the layouts assigned for any communication ops in // channel_constraints so that they are constrained for future modules. for (HloInstruction* instruction : computation->instructions()) { @@ -1556,6 +1626,34 @@ Status LayoutAssignment::RunOnComputation( return Status::OK(); } +Status LayoutAssignment::PropagateComputationLayouts( + HloComputation* computation, ComputationLayout* computation_layout) { + ComputationLayout computed_computation_layout( + computation->ComputeProgramShape(), + /*ignore_layouts=*/false); + for (int64 i = 0; i < computed_computation_layout.parameter_count(); ++i) { + ShapeLayout* param_layout = computation_layout->mutable_parameter_layout(i); + if (!param_layout->LayoutIsSet()) { + VLOG(4) << "Assigning layout to parameter " << i << " of computation " + << computation->name() << ": " + << computed_computation_layout.parameter_layout(i).ToString(); + *param_layout = computed_computation_layout.parameter_layout(i); + } else { + TF_RET_CHECK(computed_computation_layout.parameter_layout(i) == + *param_layout); + } + } + ShapeLayout* result_layout = computation_layout->mutable_result_layout(); + if (!result_layout->LayoutIsSet()) { + VLOG(4) << "Assigning result layout of computation " << computation->name() + << ": " << computed_computation_layout.result_layout().ToString(); + *result_layout = computed_computation_layout.result_layout(); + } else { + TF_RET_CHECK(computed_computation_layout.result_layout() == *result_layout); + } + return Status::OK(); +} + StatusOr LayoutAssignment::Run(HloModule* module) { VLOG(2) << "Running layout assignment on module " << module->name(); XLA_VLOG_LINES(3, module->ToString()); @@ -1564,52 +1662,45 @@ StatusOr LayoutAssignment::Run(HloModule* module) { "before layout assignment", module->config().debug_options()); } + TF_RETURN_IF_ERROR(Init()); - TF_ASSIGN_OR_RETURN(auto points_to_analysis, - TuplePointsToAnalysis::Run(module)); - - // Assign layouts to computations in an order such that a callee computation - // is handled before its caller computation. This ensures that the layout of - // all callers of a computation will agree. - std::list computation_post_order = - module->MakeComputationPostOrder(); - for (auto* computation : module->MakeComputationPostOrder()) { - if (computation->IsFusionComputation()) { - continue; - } - // Clear existing layouts of the instructions. All layouts must be assigned - // by the LayoutAssignment pass, except for those on infeeds, parameters, - // and the computation result. The latter two are specified in - // computation_layout, so we only need to keep the existing layouts for - // infeeds. Clearing the layouts here avoids hiding potential bugs in the - // layout assignment pass that may accidently use the existing layout. 
- for (HloInstruction* instruction : computation->instructions()) { - if (instruction->opcode() == HloOpcode::kBitcast) { - // bitcasts are inherently layout sensitive and so a bitcast instruction - // present in the IR before layout assignment is a bug. - return InternalError( - "Unexpected bitcast operation seen during layout assignment: %s.", - instruction->ToString().c_str()); + // We do two passes. In the first one we pass a nullptr ComputationLayout to + // the RunOnComputation() calls (for non-entry computations), and we record + // the ComputationLayouts which naturally flow in DFS fashion to the + // parameters and root instruction. + // Walking in DFS mode, though, means that we can end up with incorrect + // layouts when seen from an outer instruction, which has across-computation + // constraints to impose. + // For example, the kWhile instruction needs to enforce the same layouts for + // the parameters and root of the body, as well as the condition parameters. + // Similarly, the kConditional instruction needs to enforce the same layouts + // for the root of the true and false computations. + // So in the first pass, while allowing the layouts to flow to parameters and + // root, we also fix up any inconsistent ComputationLayouts, which will then + // be made mandatory by the second pass. + for (int64 i = 0; i < 2; ++i) { + TF_RETURN_IF_ERROR(ClearPreviousPassSideEffects(module)); + TF_ASSIGN_OR_RETURN(auto points_to_analysis, + TuplePointsToAnalysis::Run(module)); + for (auto* computation : module->MakeComputationPostOrder()) { + if (computation->IsFusionComputation()) { + continue; } - if (instruction->opcode() != HloOpcode::kInfeed) { - LayoutUtil::ClearLayout(instruction->mutable_shape()); + if (computation == module->entry_computation()) { + TF_RETURN_IF_ERROR(RunOnComputation( + entry_computation_layout_, *points_to_analysis, + module->entry_computation(), channel_layout_constraints_)); + } else { + ComputationLayout* computation_layout = + (i == 0) ? nullptr : &FindOrDie(computation_layouts_, computation); + TF_RETURN_IF_ERROR(RunOnComputation(computation_layout, + *points_to_analysis, computation, + channel_layout_constraints_)); } } - if (computation == module->entry_computation()) { - TF_RETURN_IF_ERROR(RunOnComputation( - *entry_computation_layout_, *points_to_analysis, - module->entry_computation(), channel_layout_constraints_)); - } else { - ComputationLayout computation_layout(computation->ComputeProgramShape()); - // Setting all embedded computations to the default layout is potentially - // suboptimal. - computation_layout.SetToDefaultLayout(); - TF_RETURN_IF_ERROR(RunOnComputation(computation_layout, - *points_to_analysis, computation, - channel_layout_constraints_)); - } } - + TF_RETURN_IF_ERROR(PropagateComputationLayouts(module->entry_computation(), + entry_computation_layout_)); TF_RETURN_IF_ERROR(CheckLayouts(module)); VLOG(3) << "After layout assignment:"; @@ -1619,9 +1710,54 @@ StatusOr LayoutAssignment::Run(HloModule* module) { "after layout assignment", module->config().debug_options()); } - // All layouts are reset then reassigned by this pass. return true; } +Status LayoutAssignment::Init() { + computation_layouts_.clear(); + return Status::OK(); +} + +Status LayoutAssignment::ClearPreviousPassSideEffects(HloModule* module) { + // Clear all the copies which have been added, and all the related + // instructions (like GTE and tuples).
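+ // Each registered copy is short-circuited to its operand and removed below; + // if any copy was removed, TupleSimplifier and HloDCE are then run to clean + // up the GTE/Tuple chains that became trivial.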
+ int64 removed_copies = 0; + for (HloComputation* computation : module->computations()) { + for (HloInstruction* instruction : + computation->MakeInstructionPostOrder()) { + if (instruction->opcode() == HloOpcode::kCopy && + added_copies_.count(instruction) > 0) { + VLOG(5) << "Removing added copy: " << instruction->ToString(); + TF_RETURN_IF_ERROR( + instruction->ReplaceAllUsesWith(instruction->mutable_operand(0))); + TF_RETURN_IF_ERROR(computation->RemoveInstruction(instruction)); + ++removed_copies; + } + } + } + added_copies_.clear(); + if (removed_copies > 0) { + TupleSimplifier tuple_simplifier; + HloDCE dce; + TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status()); + TF_RETURN_IF_ERROR(dce.Run(module).status()); + } + return Status::OK(); +} + +Status LayoutAssignment::AddCopyForOperand(HloInstruction* instruction, + int64 operand_number) { + HloInstruction* operand = instruction->mutable_operand(operand_number); + if (operand->opcode() != HloOpcode::kCopy || operand->user_count() > 1) { + HloInstruction* copy = + instruction->parent()->AddInstruction(HloInstruction::CreateUnary( + operand->shape(), HloOpcode::kCopy, operand)); + SetupCopiedInstruction(*operand, copy, {}); + LayoutUtil::ClearLayout(copy->mutable_shape()); + TF_RETURN_IF_ERROR(instruction->ReplaceOperandWith(operand_number, copy)); + } + return Status::OK(); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h index ae4986d6ad9..8b4e07995af 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.h +++ b/tensorflow/compiler/xla/service/layout_assignment.h @@ -39,6 +39,7 @@ limitations under the License. #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/platform/types.h" namespace xla { @@ -362,12 +363,15 @@ class LayoutAssignment : public HloPassInterface { int64 operand_no); private: + // Initializes the layout assignment object for a new Run() call. + Status Init(); + // Adds constraints which must be satisfied for correctness on all // backends. Called once prior to propagating constraints. - Status AddMandatoryConstraints( - const ComputationLayout& computation_layout, - const ChannelLayoutConstraints* channel_constraints, - HloComputation* computation, LayoutConstraints* constraints); + Status AddMandatoryConstraints(const ComputationLayout* computation_layout, + ChannelLayoutConstraints* channel_constraints, + HloComputation* computation, + LayoutConstraints* constraints); // This method can be overridden to add backend-specific constraints to the // layout of the instructions of a computation. This method is called after @@ -378,10 +382,12 @@ } // Construct contraints and assign layouts to all instructions in the - // computation satisfying the given ComputationLayout. Layouts constraints are - // added, then propagated until all LogicalBuffers in the computation are - // constrained. - Status RunOnComputation(const ComputationLayout& computation_layout, + // computation satisfying the given ComputationLayout, if not nullptr. + // Otherwise the ComputationLayout will be calculated by propagating the + // computation instruction constraints. + // Layout constraints are added, then propagated until all LogicalBuffers in + // the computation are constrained.
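+ // (Run() invokes this twice for each non-entry computation: first with a + // nullptr layout so layouts can flow to parameters and root, then with the + // ComputationLayout recorded by the first pass.)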
+ Status RunOnComputation(ComputationLayout* computation_layout, const TuplePointsToAnalysis& points_to_analysis, HloComputation* computation, ChannelLayoutConstraints* channel_constraints); @@ -402,6 +408,25 @@ class LayoutAssignment : public HloPassInterface { // necessary conditions. Status CheckLayouts(HloModule* module); + // Computes the ComputationLayout of the given computation based on the + // layouts assigned to parameters and root instruction, and inserts it into + // the computation_layouts_ map. + Status CalculateComputationLayout(HloComputation* computation); + + // Clears all the layouts which can be cleared within a computation. + Status ClearComputationLayouts(HloComputation* computation); + + // Clears the side effects of a previous pass, like added copy instructions. + Status ClearPreviousPassSideEffects(HloModule* module); + + // Propagates the layouts computed by the layout assignment pass on the given + // computation, to the computation layout passed in to this API. + // This API propagates missing layouts, and also checks that the layouts the + // caller specified have been respected, by comparing them with the parameter + // and root instruction layouts. + Status PropagateComputationLayouts(HloComputation* computation, + ComputationLayout* computation_layout); + ComputationLayout* entry_computation_layout_; protected: @@ -418,21 +443,37 @@ class LayoutAssignment : public HloPassInterface { // Creates and returns a copy of the given instruction with a different // layout. Tuple-shaped instructions will be deep-copied, and the last Tuple // instruction producing the copy is returned. - static StatusOr CreateCopyWithNewLayout( + StatusOr CreateCopyWithNewLayout( const Shape& shape_with_layout, HloInstruction* instruction); // Creates a copy of the given operand if the operand's layout does not match // the given layout. This copy replaces the use in the given instruction. // Tuple operands will be deep-copied. - static Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout, - HloInstruction* instruction, - int64 operand_no); + Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout, + HloInstruction* instruction, + int64 operand_no); + + // Registers a copy instruction added by the layout assignment pass. + void RegisterAddedCopy(HloInstruction* copy) { + CHECK_EQ(copy->opcode(), HloOpcode::kCopy); + added_copies_.insert(copy); + } + + // Adds a copy for the operand of an instruction, unless such operand is + // already a copy, and has a single user (which is necessarily the + // instruction itself). + Status AddCopyForOperand(HloInstruction* instruction, int64 operand_number); // Map containing the layouts of all computations assigned so // far. Computations are handled in a topological sort where computations are // handled before their caller instructions so the layouts of caller // instructions can be set to match the computation. std::map computation_layouts_; + + // Every copy added to the module by the layout assignment pass is registered + // here.
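+ // ClearPreviousPassSideEffects() removes these copies (and clears this set) + // before each pass over the module.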
+ tensorflow::gtl::FlatSet added_copies_; + ChannelLayoutConstraints* channel_layout_constraints_; }; diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index 39f3aefdf80..a73118c68a7 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -308,7 +308,10 @@ StatusOr> Service::CreateModuleConfig( computation_layout->mutable_result_layout()->CopyLayoutFromShape( shape_with_output_layout)); } else { - computation_layout->mutable_result_layout()->Clear(); + // TODO(b/78356948): We are forcing the default layout here. We should fix + // clients which expect a default layout, to be explicit about it, by + // passing the proper ExecutionOptions with shape_with_output_layout set. + computation_layout->mutable_result_layout()->SetToDefaultLayout(); } config->set_replica_count(options_.number_of_replicas()); diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.cc b/tensorflow/compiler/xla/service/tuple_simplifier.cc index 113c2e2bd9f..d668855084a 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier.cc +++ b/tensorflow/compiler/xla/service/tuple_simplifier.cc @@ -69,6 +69,7 @@ StatusOr TupleSimplifier::Run(HloModule* module) { // Tuple // HloInstruction* top_tuple = nullptr; + HloInstruction* first_gte = nullptr; bool can_simplify = true; for (int64 operand_number = 0; operand_number < instruction->operand_count(); ++operand_number) { @@ -78,11 +79,17 @@ StatusOr TupleSimplifier::Run(HloModule* module) { can_simplify = false; break; } - + if (first_gte == nullptr) { + first_gte = operand; + } else if (!first_gte->has_compatible_sharding(operand)) { + can_simplify = false; + break; + } if (top_tuple == nullptr) { top_tuple = operand->mutable_operand(0); if (!ShapeUtil::Compatible(top_tuple->shape(), - instruction->shape())) { + instruction->shape()) || + !instruction->has_compatible_sharding(top_tuple)) { can_simplify = false; break; } @@ -108,15 +115,17 @@ StatusOr TupleSimplifier::Run(HloModule* module) { // | // GTE if (instruction->operand(0)->opcode() == HloOpcode::kTuple) { - changed = true; HloInstruction* element_source = instruction->mutable_operand(0)->mutable_operand( instruction->tuple_index()); - TF_RETURN_IF_ERROR(instruction->ReplaceAllUsesWith(element_source)); - for (HloInstruction* user : element_source->users()) { - if (user->opcode() == HloOpcode::kTuple || - user->opcode() == HloOpcode::kGetTupleElement) { - worklist.push(user); + if (instruction->has_compatible_sharding(element_source)) { + changed = true; + TF_RETURN_IF_ERROR(instruction->ReplaceAllUsesWith(element_source)); + for (HloInstruction* user : element_source->users()) { + if (user->opcode() == HloOpcode::kTuple || + user->opcode() == HloOpcode::kGetTupleElement) { + worklist.push(user); + } } } } From 6af31f6260161bab02db83d7e9e1d7ba7fd14b2c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 20 Apr 2018 15:20:37 -0700 Subject: [PATCH 0550/1734] [XLA] Redesign: add comparator and printer for the XlaOp. This is to prepare the migration of tf2xla. There was some code that used ComputationDataHandle::handle() for comparison/printing. Now implement XlaOp's comparator and printer.
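For illustration only (not part of this change), a minimal sketch of the new semantics, assuming ops are built through an XlaBuilder as usual; the builder name and the ConstantR0 call below are just hypothetical examples:

    XlaBuilder b("example");
    XlaOp x = b.ConstantR0<float>(1.0f);
    XlaOp y = x;
    CHECK(x == y);  // Equal: same handle_ and same builder_.
    // Ops from two different builders never compare equal, even if their
    // integer handles happen to coincide.
    LOG(INFO) << "op: " << x;  // operator<< prints the op's handle.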
PiperOrigin-RevId: 193731437 --- .../compiler/xla/client/xla_client/xla_builder.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/compiler/xla/client/xla_client/xla_builder.h index 5977ee4f4bf..4955f1515d6 100644 --- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.h @@ -57,11 +57,27 @@ class XlaOp { StatusOr GetShape() const; + const XlaBuilder* builder() const { return builder_; } + + bool operator==(const XlaOp& rhs) const { + return handle_ == rhs.handle_ && builder_ == rhs.builder_; + } + + bool operator!=(const XlaOp& rhs) const { + return handle_ != rhs.handle_ || builder_ != rhs.builder_; + } + + friend std::ostream& operator<<(std::ostream& out, const XlaOp& op) { + out << op.handle(); + return out; + } + private: XlaOp(int64 handle, XlaBuilder* builder) : handle_(handle), builder_(builder) {} int64 handle() const { return handle_; } + friend class XlaBuilder; int64 handle_; From cadbb0b70b9441388a04533433245ac85f2887a9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 20 Apr 2018 15:32:32 -0700 Subject: [PATCH 0551/1734] [XLA] Redesign: implement DumpToDirectory for the HloSession. This is to prepare the migration of tf2xla. PiperOrigin-RevId: 193733029 --- tensorflow/compiler/xla/service/BUILD | 1 + tensorflow/compiler/xla/service/executable.cc | 20 +++++++++++++++++++ tensorflow/compiler/xla/service/executable.h | 5 +++++ 3 files changed, 26 insertions(+) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index bc577c173d6..afb344e5ae2 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -755,6 +755,7 @@ cc_library( ":hlo", ":hlo_execution_profile", ":hlo_graph_dumper", + ":hlo_proto", ":pool", ":session_proto", ":shaped_buffer", diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc index b097ef79cc6..8218b5f7c87 100644 --- a/tensorflow/compiler/xla/service/executable.cc +++ b/tensorflow/compiler/xla/service/executable.cc @@ -163,4 +163,24 @@ Status Executable::DumpSessionModule() { result); } +/* static */ Status Executable::DumpToDirectory(const string& directory_path, + string filename, + const HloSession& hlo_session) { + tensorflow::Env* env = tensorflow::Env::Default(); + if (!env->IsDirectory(directory_path).ok()) { + // NB! CreateDir does not work reliably with multiple XLA threads -- two + // threads can race to observe the absence of the dump directory and + // simultaneously try to create it, causing the "losing" thread to get a + // "directory already exists" error. + TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(directory_path)); + } + filename = SanitizeFileName(std::move(filename)); + string file_path = tensorflow::io::JoinPath(directory_path, filename); + string result; + TF_RET_CHECK( + tensorflow::SerializeToStringDeterministic(hlo_session, &result)); + return tensorflow::WriteStringToFile(tensorflow::Env::Default(), file_path, + result); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index 9c725f21d80..bdbe119120f 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -22,6 +22,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -155,6 +156,10 @@ class Executable { static Status DumpToDirectory(const string& directory_path, string filename, const SessionModule& session_module); + // Dump hlo_session to directory_path/filename. + static Status DumpToDirectory(const string& directory_path, string filename, + const HloSession& hlo_session); + protected: mutable tensorflow::mutex mutex_; From b2f786867dca85b6b848f09f2c1d40dd123fc0fc Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Fri, 20 Apr 2018 15:38:06 -0700 Subject: [PATCH 0552/1734] Always use the local worker name in CreateWorkerSession when not doing ClusterSpec propagation. Previously, the master would send a job name and task index in an otherwise-empty ServerDef, and the worker would unquestioningly use those to build its worker name. However, this would lead to errors if the worker had a local name like "/job:worker/replica:1/task:0", because the ServerDef doesn't support non-zero replica IDs, and so the local worker would end up an inconsistent view of what its worker name should be. In particular `WorkerSession::worker_name` would disagree with the device names added during graph partitioning by the master, which would lead to runtime failures ("InvalidArgumentError: Invalid rendezvous key"). PiperOrigin-RevId: 193733855 --- tensorflow/core/distributed_runtime/BUILD | 1 + .../distributed_runtime/master_session.cc | 28 +++++++++--------- .../core/distributed_runtime/session_mgr.cc | 6 ++-- .../distributed_runtime/session_mgr_test.cc | 29 +++++++++++++++++++ 4 files changed, 49 insertions(+), 15 deletions(-) diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD index d564727da50..343dd5d4560 100644 --- a/tensorflow/core/distributed_runtime/BUILD +++ b/tensorflow/core/distributed_runtime/BUILD @@ -145,6 +145,7 @@ tf_cc_test( deps = [ ":session_mgr", ":worker_env", + "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr", diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc index ebe350d313d..e3022f38a24 100644 --- a/tensorflow/core/distributed_runtime/master_session.cc +++ b/tensorflow/core/distributed_runtime/master_session.cc @@ -1219,17 +1219,6 @@ Status MasterSession::CreateWorkerSessions( workers[i].name = &worker_names[i]; workers[i].worker = worker_cache->CreateWorker(worker_names[i]); workers[i].request.set_session_handle(handle_); - if (options.cluster_def) { - *workers[i].request.mutable_server_def()->mutable_cluster() = - *options.cluster_def; - workers[i].request.mutable_server_def()->set_protocol(*options.protocol); - // Session state is always isolated when ClusterSpec propagation - // is in use. 
- workers[i].request.set_isolate_session_state(true); - } else { - workers[i].request.set_isolate_session_state( - session_opts_.config.isolate_session_state()); - } DeviceNameUtils::ParsedName name; if (!DeviceNameUtils::ParseFullName(worker_names[i], &name)) { @@ -1243,8 +1232,21 @@ Status MasterSession::CreateWorkerSessions( return status; } - workers[i].request.mutable_server_def()->set_job_name(name.job); - workers[i].request.mutable_server_def()->set_task_index(name.task); + if (options.cluster_def) { + *workers[i].request.mutable_server_def()->mutable_cluster() = + *options.cluster_def; + workers[i].request.mutable_server_def()->set_protocol(*options.protocol); + workers[i].request.mutable_server_def()->set_job_name(name.job); + workers[i].request.mutable_server_def()->set_task_index(name.task); + // Session state is always isolated when ClusterSpec propagation + // is in use. + workers[i].request.set_isolate_session_state(true); + } else { + // NOTE(mrry): Do not set any component of the ServerDef, + // because the worker will use its local configuration. + workers[i].request.set_isolate_session_state( + session_opts_.config.isolate_session_state()); + } } for (size_t i = 0; i < worker_names.size(); ++i) { diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc index 357e9f8930f..7ef4206c780 100644 --- a/tensorflow/core/distributed_runtime/session_mgr.cc +++ b/tensorflow/core/distributed_runtime/session_mgr.cc @@ -43,6 +43,7 @@ SessionMgr::SessionMgr( new GraphMgr(worker_env, worker_env->device_mgr)))), worker_cache_factory_(std::move(worker_cache_factory)) {} +/* static */ string SessionMgr::WorkerNameFromServerDef(const ServerDef& server_def) { return strings::StrCat("/job:", server_def.job_name(), "/replica:0/task:", server_def.task_index()); @@ -56,13 +57,14 @@ Status SessionMgr::CreateSession(const string& session, return errors::InvalidArgument("Session must be non-empty."); } - const string worker_name = WorkerNameFromServerDef(server_def); - WorkerCacheInterface* worker_cache = nullptr; + string worker_name; if (server_def.cluster().job().empty()) { worker_cache = new WorkerCacheWrapper(default_worker_cache_.get()); + worker_name = legacy_session_->worker_name; } else { TF_RETURN_IF_ERROR(worker_cache_factory_(server_def, &worker_cache)); + worker_name = WorkerNameFromServerDef(server_def); } if (worker_cache != nullptr & default_worker_cache_.get() != nullptr) { diff --git a/tensorflow/core/distributed_runtime/session_mgr_test.cc b/tensorflow/core/distributed_runtime/session_mgr_test.cc index 0da333833ad..99192119a63 100644 --- a/tensorflow/core/distributed_runtime/session_mgr_test.cc +++ b/tensorflow/core/distributed_runtime/session_mgr_test.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include "tensorflow/core/distributed_runtime/worker_env.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/cluster.pb.h" namespace tensorflow { @@ -77,6 +78,34 @@ TEST_F(SessionMgrTest, CreateSessionSimple) { TF_EXPECT_OK(mgr_.DeleteSession(session_handle)); } +TEST_F(SessionMgrTest, CreateSessionClusterDefWorkerName) { + ServerDef server_def; + server_def.set_job_name("worker"); + server_def.set_task_index(3); + auto job = server_def.mutable_cluster()->add_job(); + job->set_name("worker"); + job->mutable_tasks()->insert({3, "localhost:3333"}); + + string session_handle = "test_session_handle"; + TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def, true)); + std::shared_ptr session; + TF_EXPECT_OK(mgr_.WorkerSessionForSession(session_handle, &session)); + EXPECT_NE(nullptr, session) << "Session for " << session_handle << "was null"; + EXPECT_EQ("/job:worker/replica:0/task:3", session->worker_name); + TF_EXPECT_OK(mgr_.DeleteSession(session_handle)); +} + +TEST_F(SessionMgrTest, CreateSessionDefaultWorkerName) { + ServerDef server_def; + string session_handle = "test_session_handle"; + TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def, true)); + std::shared_ptr session; + TF_EXPECT_OK(mgr_.WorkerSessionForSession(session_handle, &session)); + EXPECT_NE(nullptr, session) << "Session for " << session_handle << "was null"; + EXPECT_EQ("/job:mnist/replica:0/task:0", session->worker_name); + TF_EXPECT_OK(mgr_.DeleteSession(session_handle)); +} + TEST_F(SessionMgrTest, CreateSessionIsolateSessionState) { ServerDef server_def; server_def.set_job_name("worker"); From c015a45646029f8c116028505f2da9e023b5c2b7 Mon Sep 17 00:00:00 2001 From: Brennan Saeta Date: Fri, 20 Apr 2018 15:51:16 -0700 Subject: [PATCH 0553/1734] Support legacy clusters PiperOrigin-RevId: 193735742 --- .../cluster_resolver/python/training/tpu_cluster_resolver.py | 2 +- .../python/training/tpu_cluster_resolver_test.py | 3 +-- tensorflow/contrib/tpu/python/tpu/tpu_config.py | 5 +++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py index 5a2771229d9..1403483d287 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py +++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py @@ -245,7 +245,7 @@ class TPUClusterResolver(ClusterResolver): else: if not self._tpu.startswith(compat.as_bytes('grpc://')): # Case 3. - return server_lib.ClusterSpec({}) + return None # Case 2. 
cluster_spec = {self._job_name: [self._tpu[len( compat.as_bytes('grpc://')):]]} diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py index dff7a03b684..5b3f9be5a11 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py +++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py @@ -356,8 +356,7 @@ class TPUClusterResolverTest(test.TestCase): tpu_cluster_resolver = TPUClusterResolver(tpu='/bns/foo/bar') self.assertEqual( compat.as_bytes('/bns/foo/bar'), tpu_cluster_resolver.master()) - self.assertEqual( - server_lib.ClusterSpec({}), tpu_cluster_resolver.cluster_spec()) + self.assertEqual(None, tpu_cluster_resolver.cluster_spec()) def testGkeEnvironment(self): os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = 'grpc://10.120.27.5:8470' diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py index cc1a7fd8015..6d7331e3c79 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py @@ -210,8 +210,9 @@ class RunConfig(run_config_lib.RunConfig): raise ValueError( 'You cannot provide a ClusterResolver and ' 'session_config.cluster_def.') - self._session_config.cluster_def.CopyFrom( - self._cluster_spec.as_cluster_def()) + if self._cluster_spec: + self._session_config.cluster_def.CopyFrom( + self._cluster_spec.as_cluster_def()) @property def evaluation_master(self): From a0071844d0af47f22ab512363b56383acf762dff Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 20 Apr 2018 16:05:47 -0700 Subject: [PATCH 0554/1734] Remove protected data members from GraphOptimizerStage. PiperOrigin-RevId: 193737654 --- .../optimizers/arithmetic_optimizer.cc | 54 +++++++++---------- .../optimizers/graph_optimizer_stage.h | 5 +- 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc index 232132e1e8f..ed199c1ac8b 100644 --- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc @@ -294,8 +294,8 @@ class ArithmeticOptimizerStage : public GraphOptimizerStage { for (int i = src->input_size() - 1; i >= 0; --i) { if (IsControlInput(src->input(i))) { *target_node->add_input() = src->input(i); - ctx_.node_map->AddOutput(NodeName(src->input(i)), - target_node->name()); + ctx().node_map->AddOutput(NodeName(src->input(i)), + target_node->name()); } else { break; } @@ -442,7 +442,7 @@ class ArithmeticNodesGroupOptimizerStage : public ArithmeticOptimizerStage { // TODO(ezhulenev): move to GraphOptimizerStage? 
bool DrivesControlDependency(const NodeDef& node) const { int position; - for (const NodeDef* output : ctx_.node_map->GetOutputs(node.name())) { + for (const NodeDef* output : ctx().node_map->GetOutputs(node.name())) { for (int i = 0; i < output->input_size(); ++i) { auto input = output->input(i); string name = ParseNodeName(input, &position); @@ -476,8 +476,8 @@ class ArithmeticNodesGroupOptimizerStage : public ArithmeticOptimizerStage { } bool IsInPreserveSet(const NodeDef& node) const { - return ctx_.nodes_to_preserve->find(node.name()) != - ctx_.nodes_to_preserve->end(); + return ctx().nodes_to_preserve->find(node.name()) != + ctx().nodes_to_preserve->end(); } bool IsAlreadyOptimized(const NodeDef& node) const { @@ -546,7 +546,7 @@ class AddOpsRewriteStage : public ArithmeticNodesGroupOptimizerStage { // with a single output data consumer (presumably if we reach this node from // previously absorbed or a root node, it means that this node is not used // as an input to any other op, outside of the group) - if (NumNonControlDataOutputs(node, *ctx_.node_map) != 1) { + if (NumNonControlDataOutputs(node, *ctx().node_map) != 1) { return false; } // All input shapes must be broadcastable to the node shape @@ -685,7 +685,7 @@ class AddOpsRewriteStage : public ArithmeticNodesGroupOptimizerStage { (*node->mutable_attr())["N"].set_i(inputs.size()); for (const auto& inputAndShape : inputs) { - ctx_.node_map->AddOutput(inputAndShape.input, node_name); + ctx().node_map->AddOutput(inputAndShape.input, node_name); node->add_input(inputAndShape.input); } @@ -707,8 +707,8 @@ class AddOpsRewriteStage : public ArithmeticNodesGroupOptimizerStage { node->set_device(root_node.device()); (*node->mutable_attr())["T"].set_type(dtype); - ctx_.node_map->AddOutput(left.input, node_name); - ctx_.node_map->AddOutput(right.input, node_name); + ctx().node_map->AddOutput(left.input, node_name); + ctx().node_map->AddOutput(right.input, node_name); node->add_input(left.input); node->add_input(right.input); @@ -784,20 +784,20 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage { new_outer_node->set_input(1, new_add_node->name()); } - ctx_.node_map->AddOutput(common_factor, new_outer_node->name()); - ctx_.node_map->AddOutput(new_add_node->name(), new_outer_node->name()); + ctx().node_map->AddOutput(common_factor, new_outer_node->name()); + ctx().node_map->AddOutput(new_add_node->name(), new_outer_node->name()); // Hoist non-shared factors up into the new AddN node. 
for (int i = 0; i < unique_factors.size(); ++i) { const string& unique_factor_i = unique_factors[i]; new_add_node->set_input(i, unique_factor_i); - ctx_.node_map->AddOutput(unique_factor_i, new_add_node->name()); + ctx().node_map->AddOutput(unique_factor_i, new_add_node->name()); } // Add control deps on add node for (const string& ctrl_dep : ctrl_deps) { *new_add_node->add_input() = ctrl_dep; - ctx_.node_map->AddOutput(NodeName(ctrl_dep), new_add_node->name()); + ctx().node_map->AddOutput(NodeName(ctrl_dep), new_add_node->name()); } // optimize new inner aggregation node @@ -931,8 +931,8 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage { // if graph rewrite happens in multiple passes without graph pruning between // them, it's possible that rewritten node already exists in a graph return rewritten_nodes_.find(node->name()) != rewritten_nodes_.end() || - ctx_.node_map->NodeExists(OuterNodeName(node, false)) || - ctx_.node_map->NodeExists(OuterNodeName(node, true)); + ctx().node_map->NodeExists(OuterNodeName(node, false)) || + ctx().node_map->NodeExists(OuterNodeName(node, true)); } // keep names of the nodes that were optimized by this stage @@ -996,7 +996,7 @@ class MinimizeBroadcasts : public ArithmeticNodesGroupOptimizerStage { } // Optimized nodes updated in place, and that would break the graph, if the // node has multiple output consumers - if (NumNonControlOutputs(node, *ctx_.node_map) != 1) { + if (NumNonControlOutputs(node, *ctx().node_map) != 1) { return false; } // All input shapes must be broadcastable to the node shape @@ -1120,13 +1120,13 @@ class MinimizeBroadcasts : public ArithmeticNodesGroupOptimizerStage { node->set_input(0, input_0); node->set_input(1, input_1); // Invalidate node properties (shape) - ctx_.graph_properties->ClearOutputProperties(node->name()); - ctx_.graph_properties->ClearInputProperties(node->name()); + ctx().graph_properties->ClearOutputProperties(node->name()); + ctx().graph_properties->ClearInputProperties(node->name()); // Update the node map - ctx_.node_map->RemoveOutput(NodeName(old_input_0), node->name()); - ctx_.node_map->RemoveOutput(NodeName(old_input_1), node->name()); - ctx_.node_map->AddOutput(NodeName(input_0), node->name()); - ctx_.node_map->AddOutput(NodeName(input_1), node->name()); + ctx().node_map->RemoveOutput(NodeName(old_input_0), node->name()); + ctx().node_map->RemoveOutput(NodeName(old_input_1), node->name()); + ctx().node_map->AddOutput(NodeName(input_0), node->name()); + ctx().node_map->AddOutput(NodeName(input_1), node->name()); // Add updated node to optimization queue AddToOptimizationQueue(node); } @@ -1257,8 +1257,8 @@ class RemoveRedundantBitcastStage : public ArithmeticOptimizerStage { // Bitcast(Bitcast(x, type1), type2) => Bitcast(x, type2) bitcast->set_input(0, operand->input(0)); SetSourceDataType(GetSourceDataType(*operand), bitcast); - ctx_.node_map->UpdateInput(bitcast->name(), bitcast->input(0), - operand->input(0)); + ctx().node_map->UpdateInput(bitcast->name(), bitcast->input(0), + operand->input(0)); AddToOptimizationQueue(bitcast); *simplified_node_name = bitcast->name(); } @@ -1313,14 +1313,14 @@ class RemoveNegationStage : public ArithmeticOptimizerStage { node->mutable_input()->SwapElements(0, 1); node->set_input(1, x->input(0)); node->add_input(AsControlDependency(x->name())); - ctx_.node_map->AddOutput(NodeName(x->input(0)), node_name); + ctx().node_map->AddOutput(NodeName(x->input(0)), node_name); updated = true; } else if (IsNeg(*y)) { // a + (-b) = a - b 
node->set_op("Sub"); node->set_input(1, y->input(0)); node->add_input(AsControlDependency(y->name())); - ctx_.node_map->AddOutput(NodeName(y->input(0)), node_name); + ctx().node_map->AddOutput(NodeName(y->input(0)), node_name); updated = true; } } else if (IsSub(*node)) { @@ -1329,7 +1329,7 @@ class RemoveNegationStage : public ArithmeticOptimizerStage { node->set_op("Add"); node->set_input(1, y->input(0)); node->add_input(AsControlDependency(y->name())); - ctx_.node_map->AddOutput(NodeName(y->input(0)), node_name); + ctx().node_map->AddOutput(NodeName(y->input(0)), node_name); updated = true; } } diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h index ed398525f3c..089cad36e9a 100644 --- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h +++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h @@ -182,7 +182,10 @@ class GraphOptimizerStage { return ::tensorflow::grappler::AddEmptyNode(ctx_, name); } - protected: // Data members + protected: + const GraphOptimizerContext& ctx() const { return ctx_; } + + private: // Data members const string optimizer_name_; const string stage_name_; const GraphOptimizerContext ctx_; From 3fa8795c511931b55a9703956bdf564fde817c2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Branchaud-Charron?= Date: Fri, 20 Apr 2018 19:10:41 -0400 Subject: [PATCH 0555/1734] Fix casting in Keras estimator (#18104) --- .../python/keras/_impl/keras/estimator.py | 22 +++++++++++++---- .../keras/_impl/keras/estimator_test.py | 24 +++++++++++++++---- 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/keras/_impl/keras/estimator.py b/tensorflow/python/keras/_impl/keras/estimator.py index b922a6c6839..c3c3fceb454 100644 --- a/tensorflow/python/keras/_impl/keras/estimator.py +++ b/tensorflow/python/keras/_impl/keras/estimator.py @@ -29,12 +29,14 @@ from tensorflow.python.estimator import run_config as run_config_lib from tensorflow.python.framework import ops from tensorflow.python.framework import random_seed from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib +from tensorflow.python.framework import tensor_util from tensorflow.python.keras._impl.keras import backend as K from tensorflow.python.keras._impl.keras import models from tensorflow.python.keras._impl.keras import optimizers from tensorflow.python.keras._impl.keras.engine.base_layer import Layer from tensorflow.python.keras._impl.keras.engine.network import Network from tensorflow.python.keras._impl.keras.utils.generic_utils import CustomObjectScope +from tensorflow.python.ops import check_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import metrics as metrics_module from tensorflow.python.ops import variables as variables_module @@ -55,6 +57,17 @@ def _cast_tensor_to_floatx(x): return math_ops.cast(x, K.floatx()) +def _convert_tensor(x): + """Create or cast tensor if needed.""" + if not tensor_util.is_tensor(x): + # x is a numpy array + x = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(x) + if check_ops.is_numeric_tensor(x): + # is_numeric_tensor returns False if provided with a numpy array + x = _cast_tensor_to_floatx(x) + return x + + def _any_variable_initalized(): """Check if any variable has been initialized in the Keras model. 
@@ -86,7 +99,7 @@ def _create_ordered_io(keras_model, estimator_io, is_input=True): if isinstance(estimator_io, (list, tuple)): # Case currently not supported by most built-in input_fn, # but it's good to have for sanity - return [_cast_tensor_to_floatx(x) for x in estimator_io] + return [_convert_tensor(x) for x in estimator_io] elif isinstance(estimator_io, dict): if is_input: if keras_model._is_graph_network: @@ -108,12 +121,12 @@ def _create_ordered_io(keras_model, estimator_io, is_input=True): 'It needs to match one ' 'of the following: %s' % ('input' if is_input else 'output', key, ', '.join(keras_io_names))) - tensors = [_cast_tensor_to_floatx(estimator_io[io_name]) + tensors = [_convert_tensor(estimator_io[io_name]) for io_name in keras_io_names] return tensors else: # Plain array. - return _cast_tensor_to_floatx(estimator_io) + return _convert_tensor(estimator_io) def _in_place_subclassed_model_reset(model): @@ -274,8 +287,7 @@ def _clone_and_build_model(mode, is_input=False) else: target_tensors = [ - _cast_tensor_to_floatx( - sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(labels)) + _convert_tensor(labels) ] if keras_model._is_graph_network: diff --git a/tensorflow/python/keras/_impl/keras/estimator_test.py b/tensorflow/python/keras/_impl/keras/estimator_test.py index 653cdc01e24..80fa87d0410 100644 --- a/tensorflow/python/keras/_impl/keras/estimator_test.py +++ b/tensorflow/python/keras/_impl/keras/estimator_test.py @@ -30,6 +30,7 @@ from tensorflow.python.estimator.inputs import numpy_io from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.keras._impl import keras +from tensorflow.python.keras._impl.keras import backend as K from tensorflow.python.keras._impl.keras import testing_utils from tensorflow.python.keras._impl.keras.applications import mobilenet from tensorflow.python.keras._impl.keras.optimizers import SGD @@ -142,16 +143,20 @@ def randomize_io_type(array, name): def multi_inputs_multi_outputs_model(): - # test multi-input layer a = keras.layers.Input(shape=(16,), name='input_a') b = keras.layers.Input(shape=(16,), name='input_b') + m = keras.layers.Input(shape=(8,), dtype='bool', name='input_m') dense = keras.layers.Dense(8, name='dense_1') + a_2 = dense(a) + # Apply a mask + s_2 = keras.layers.Lambda(lambda k: + K.switch(k[0], k[1], K.zeros_like(k[1])))([m, a_2]) b_2 = dense(b) - merged = keras.layers.concatenate([a_2, b_2], name='merge') + merged = keras.layers.concatenate([s_2, b_2], name='merge') c = keras.layers.Dense(3, activation='softmax', name='dense_2')(merged) d = keras.layers.Dense(2, activation='softmax', name='dense_3')(merged) - model = keras.models.Model(inputs=[a, b], outputs=[c, d]) + model = keras.models.Model(inputs=[a, b, m], outputs=[c, d]) model.compile( loss='categorical_crossentropy', optimizer='rmsprop', @@ -352,18 +357,27 @@ class TestKerasEstimator(test_util.TensorFlowTestCase): test_samples=50, input_shape=(16,), num_classes=2) + np.random.seed(_RANDOM_SEED) + (input_m_train, _), (input_m_test, _) = testing_utils.get_test_data( + train_samples=_TRAIN_SIZE, + test_samples=50, + input_shape=(8,), + num_classes=2) + c_train = keras.utils.to_categorical(c_train) c_test = keras.utils.to_categorical(c_test) d_train = keras.utils.to_categorical(d_train) d_test = keras.utils.to_categorical(d_test) def train_input_fn(): - input_dict = {'input_a': a_train, 'input_b': b_train} + input_dict = {'input_a': a_train, 'input_b': b_train, + 'input_m': input_m_train > 0} output_dict = 
{'dense_2': c_train, 'dense_3': d_train} return input_dict, output_dict def eval_input_fn(): - input_dict = {'input_a': a_test, 'input_b': b_test} + input_dict = {'input_a': a_test, 'input_b': b_test, + 'input_m': input_m_test > 0} output_dict = {'dense_2': c_test, 'dense_3': d_test} return input_dict, output_dict From cd095e0c455b3df98841ca70ba24fd41935552e7 Mon Sep 17 00:00:00 2001 From: Asim Shankar Date: Fri, 20 Apr 2018 16:18:29 -0700 Subject: [PATCH 0556/1734] tf.contrib.data.scan: Support eager execution. PiperOrigin-RevId: 193739234 --- .../contrib/data/python/kernel_tests/BUILD | 1 + .../kernel_tests/scan_dataset_op_test.py | 23 ++++++++++++------- .../contrib/data/python/ops/scan_ops.py | 1 + 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD index 05a4f5028ab..9d1e8b20c2a 100644 --- a/tensorflow/contrib/data/python/kernel_tests/BUILD +++ b/tensorflow/contrib/data/python/kernel_tests/BUILD @@ -343,6 +343,7 @@ py_test( "//tensorflow/python:dtypes", "//tensorflow/python:errors", "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/eager:context", "//third_party/py/numpy", ], ) diff --git a/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py index e0494736b72..1a97a84b2cb 100644 --- a/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py @@ -24,9 +24,11 @@ import numpy as np from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base from tensorflow.contrib.data.python.ops import scan_ops from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.platform import test @@ -57,19 +59,24 @@ class ScanDatasetTest(test.TestCase): with self.assertRaises(errors.OutOfRangeError): sess.run(next_element) + @test_util.run_in_graph_and_eager_modes() def testFibonacci(self): iterator = dataset_ops.Dataset.from_tensors(1).repeat(None).apply( scan_ops.scan([0, 1], lambda a, _: ([a[1], a[0] + a[1]], a[1])) ).make_one_shot_iterator() - next_element = iterator.get_next() - with self.test_session() as sess: - self.assertEqual(1, sess.run(next_element)) - self.assertEqual(1, sess.run(next_element)) - self.assertEqual(2, sess.run(next_element)) - self.assertEqual(3, sess.run(next_element)) - self.assertEqual(5, sess.run(next_element)) - self.assertEqual(8, sess.run(next_element)) + if context.executing_eagerly(): + next_element = iterator.get_next + else: + get_next = iterator.get_next() + next_element = lambda: get_next + + self.assertEqual(1, self.evaluate(next_element())) + self.assertEqual(1, self.evaluate(next_element())) + self.assertEqual(2, self.evaluate(next_element())) + self.assertEqual(3, self.evaluate(next_element())) + self.assertEqual(5, self.evaluate(next_element())) + self.assertEqual(8, self.evaluate(next_element())) def testChangingStateShape(self): # Test the fixed-point shape invariant calculations: start with diff --git a/tensorflow/contrib/data/python/ops/scan_ops.py b/tensorflow/contrib/data/python/ops/scan_ops.py index 1c88366273f..711a538697a 100644 --- 
a/tensorflow/contrib/data/python/ops/scan_ops.py +++ b/tensorflow/contrib/data/python/ops/scan_ops.py @@ -144,6 +144,7 @@ class _ScanDataset(dataset_ops.Dataset): weakened_state_shapes) self._scan_func = tf_scan_func + self._scan_func.add_to_graph(ops.get_default_graph()) def _as_variant_tensor(self): input_t = self._input_dataset._as_variant_tensor() # pylint: disable=protected-access From 8d3a41f459b776856ff668bb076d4bc449927e09 Mon Sep 17 00:00:00 2001 From: Yunxing Dai Date: Fri, 20 Apr 2018 16:30:02 -0700 Subject: [PATCH 0557/1734] [XLA] Remove constant cast in literal util. It's not portable to modify an underlying char array of a C++ string object: (https://stackoverflow.com/questions/5729203/modifying-underlying-char-array-of-a-c-string-object) RELNOTES: n/a PiperOrigin-RevId: 193740595 --- tensorflow/compiler/xla/literal_util.cc | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc index c315b4ff300..bb6dd4f9098 100644 --- a/tensorflow/compiler/xla/literal_util.cc +++ b/tensorflow/compiler/xla/literal_util.cc @@ -44,8 +44,16 @@ namespace { constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__; -// Converts between little and big endian, assuming elements in the array are 16 -// bits long. +// Converts between little and big endian. +// +// Precondition: size % 2 == 0 (elements in the array are 16 bits long) +void ConvertEndianShort(string* bytes) { + CHECK_EQ(bytes->size() % 2, 0); + for (int64 i = 0; i < bytes->size(); i += 2) { + std::swap((*bytes)[i], (*bytes)[i + 1]); + } +} + void ConvertEndianShort(char* bytes, int64 size) { CHECK_EQ(size / 2, 0); for (int64 i = 0; i < size; i += 2) { @@ -1930,16 +1938,14 @@ void Literal::Piece::WriteToProto(LiteralProto* proto) const { *proto->mutable_f16s() = string( reinterpret_cast(data().data()), size_bytes()); if (!kLittleEndian) { - ConvertEndianShort(const_cast(proto->mutable_f16s()->data()), - proto->f16s().size()); + ConvertEndianShort(proto->mutable_f16s()); } break; case BF16: *proto->mutable_bf16s() = string( reinterpret_cast(data().data()), size_bytes()); if (!kLittleEndian) { - ConvertEndianShort(const_cast(proto->mutable_bf16s()->data()), - proto->bf16s().size()); + ConvertEndianShort(proto->mutable_bf16s()); } break; case F32: From 16f0a5bb2aed8d0e605004b421a9cd6f32e37f94 Mon Sep 17 00:00:00 2001 From: Asim Shankar Date: Fri, 20 Apr 2018 16:48:44 -0700 Subject: [PATCH 0558/1734] Java: Bump release to 1.8.0-rc1 PiperOrigin-RevId: 193742798 --- tensorflow/java/maven/libtensorflow/pom.xml | 2 +- tensorflow/java/maven/libtensorflow_jni/pom.xml | 2 +- tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml | 2 +- tensorflow/java/maven/pom.xml | 2 +- tensorflow/java/maven/proto/pom.xml | 2 +- tensorflow/java/maven/tensorflow/pom.xml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml index 9c1601753bd..66985e3b18c 100644 --- a/tensorflow/java/maven/libtensorflow/pom.xml +++ b/tensorflow/java/maven/libtensorflow/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.8.0-rc0 + 1.8.0-rc1 ../ libtensorflow diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml index 3d013e12b0d..34d4ba0b083 100644 --- a/tensorflow/java/maven/libtensorflow_jni/pom.xml +++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom -
1.8.0-rc0 + 1.8.0-rc1 ../ libtensorflow_jni diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml index 40e44af1f53..1909d08e41d 100644 --- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml +++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.8.0-rc0 + 1.8.0-rc1 ../ libtensorflow_jni_gpu diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml index 82bfd0c73ae..ba98732f5ad 100644 --- a/tensorflow/java/maven/pom.xml +++ b/tensorflow/java/maven/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.tensorflow parentpom - 1.8.0-rc0 + 1.8.0-rc1 pom https://www.tensorflow.org diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml index 0a2775a500c..dee8c343598 100644 --- a/tensorflow/java/maven/proto/pom.xml +++ b/tensorflow/java/maven/proto/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.8.0-rc0 + 1.8.0-rc1 ../ proto diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml index 61961432a7e..95e024ace97 100644 --- a/tensorflow/java/maven/tensorflow/pom.xml +++ b/tensorflow/java/maven/tensorflow/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.8.0-rc0 + 1.8.0-rc1 ../ tensorflow From a722cdf7a62a3ee82ca6ee1b3d33f3d03dba49ee Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Wed, 18 Apr 2018 15:04:21 -0700 Subject: [PATCH 0559/1734] Fix loss computation bug in Model training/eval methods with eager execution enabled. Fixes #18642. PiperOrigin-RevId: 193423288 --- .../_impl/keras/engine/training_eager.py | 2 +- .../_impl/keras/engine/training_eager_test.py | 25 +++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager.py b/tensorflow/python/keras/_impl/keras/engine/training_eager.py index 4cdb5f108a0..695669d9ee1 100644 --- a/tensorflow/python/keras/_impl/keras/engine/training_eager.py +++ b/tensorflow/python/keras/_impl/keras/engine/training_eager.py @@ -150,7 +150,7 @@ def _model_loss(model, inputs, targets, sample_weights=None, training=False): weighted_masked_fn = training_utils.weighted_masked_objective(loss_fn) with backend.name_scope(model.output_names[i] + '_loss'): output_loss = weighted_masked_fn( - outs[i], targets[i], weights, mask=mask) + targets[i], outs[i], weights, mask=mask) loss_metrics.append(backend.mean(output_loss)) loss_weight = model.loss_weights_list[i] diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py index 6cdb6b0753f..ed0f91ee1e2 100644 --- a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py +++ b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py @@ -21,6 +21,7 @@ from __future__ import print_function import numpy as np from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util as tf_test_util from tensorflow.python.keras._impl import keras from tensorflow.python.keras._impl.keras import testing_utils from tensorflow.python.platform import test @@ -625,6 +626,30 @@ class LossWeightingTest(test.TestCase): model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np}) +class CorrectnessTest(test.TestCase): + + @tf_test_util.run_in_graph_and_eager_modes() + def test_loss_correctness(self): + # Test that training loss is the same in eager and graph + # (by comparing it to a reference value in a deterministic case) + model = 
keras.Sequential() + model.add(keras.layers.Dense(3, + activation='relu', + input_dim=4, + kernel_initializer='ones')) + model.add(keras.layers.Dense(2, + activation='softmax', + kernel_initializer='ones')) + model.compile(loss='sparse_categorical_crossentropy', + optimizer=RMSPropOptimizer(learning_rate=0.001)) + x = np.ones((100, 4)) + np.random.seed(123) + y = np.random.randint(0, 1, size=(100, 1)) + history = model.fit(x, y, epochs=1, batch_size=10) + self.assertEqual( + np.around(history.history['loss'][-1], decimals=4), 0.6173) + + if __name__ == '__main__': ops.enable_eager_execution() test.main() From 0385bfe0726ad9710bfcca145e19611e9e2391bb Mon Sep 17 00:00:00 2001 From: Mustafa Ispir Date: Fri, 20 Apr 2018 17:03:14 -0700 Subject: [PATCH 0560/1734] Let estimators be used when eager is enabled. PiperOrigin-RevId: 193744371 --- tensorflow/python/estimator/estimator.py | 263 +++++++++--------- tensorflow/python/estimator/estimator_test.py | 1 + 2 files changed, 133 insertions(+), 131 deletions(-) diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py index 9862fdecdb2..351fcb64232 100644 --- a/tensorflow/python/estimator/estimator.py +++ b/tensorflow/python/estimator/estimator.py @@ -100,10 +100,6 @@ class Estimator(object): None of `Estimator`'s methods can be overridden in subclasses (its constructor enforces this). Subclasses should use `model_fn` to configure the base class, and may add methods implementing specialized functionality. - - @compatibility(eager) - Estimators are not compatible with eager execution. - @end_compatibility """ def __init__(self, model_fn, model_dir=None, config=None, params=None, @@ -166,15 +162,10 @@ class Estimator(object): vocabularies and Tensor names are unchanged. Raises: - RuntimeError: If eager execution is enabled. ValueError: parameters of `model_fn` don't match `params`. ValueError: if this is called via a subclass and if that class overrides a member of `Estimator`. """ - if context.executing_eagerly(): - raise RuntimeError( - 'Estimators are not supported when eager execution is enabled.') - Estimator._assert_members_are_not_overridden(self) if config is None: @@ -269,7 +260,8 @@ class Estimator(object): ValueError: If the Estimator has not produced a checkpoint yet. """ _check_checkpoint_available(self.model_dir) - return training.load_variable(self.model_dir, name) + with context.graph_mode(): + return training.load_variable(self.model_dir, name) def get_variable_names(self): """Returns list of all variable names in this model. @@ -281,7 +273,8 @@ class Estimator(object): ValueError: If the Estimator has not produced a checkpoint yet. """ _check_checkpoint_available(self.model_dir) - return [name for name, _ in training.list_variables(self.model_dir)] + with context.graph_mode(): + return [name for name, _ in training.list_variables(self.model_dir)] def latest_checkpoint(self): """Finds the filename of latest saved checkpoint file in `model_dir`. @@ -290,7 +283,8 @@ class Estimator(object): The full path to the latest checkpoint or `None` if no checkpoint was found. """ - return saver.latest_checkpoint(self.model_dir) + with context.graph_mode(): + return saver.latest_checkpoint(self.model_dir) def train(self, input_fn, hooks=None, steps=None, max_steps=None, saving_listeners=None): @@ -342,27 +336,28 @@ ValueError: If both `steps` and `max_steps` are not `None`. ValueError: If either `steps` or `max_steps` is <= 0.
""" - if (steps is not None) and (max_steps is not None): - raise ValueError('Can not provide both steps and max_steps.') - if steps is not None and steps <= 0: - raise ValueError('Must specify steps > 0, given: {}'.format(steps)) - if max_steps is not None and max_steps <= 0: - raise ValueError( - 'Must specify max_steps > 0, given: {}'.format(max_steps)) + with context.graph_mode(): + if (steps is not None) and (max_steps is not None): + raise ValueError('Can not provide both steps and max_steps.') + if steps is not None and steps <= 0: + raise ValueError('Must specify steps > 0, given: {}'.format(steps)) + if max_steps is not None and max_steps <= 0: + raise ValueError( + 'Must specify max_steps > 0, given: {}'.format(max_steps)) - if max_steps is not None: - start_step = _load_global_step_from_checkpoint_dir(self._model_dir) - if max_steps <= start_step: - logging.info('Skipping training since max_steps has already saved.') - return self + if max_steps is not None: + start_step = _load_global_step_from_checkpoint_dir(self._model_dir) + if max_steps <= start_step: + logging.info('Skipping training since max_steps has already saved.') + return self - hooks = _check_hooks_type(hooks) - hooks.extend(self._convert_train_steps_to_hooks(steps, max_steps)) + hooks = _check_hooks_type(hooks) + hooks.extend(self._convert_train_steps_to_hooks(steps, max_steps)) - saving_listeners = _check_listeners_type(saving_listeners) - loss = self._train_model(input_fn, hooks, saving_listeners) - logging.info('Loss for final step: %s.', loss) - return self + saving_listeners = _check_listeners_type(saving_listeners) + loss = self._train_model(input_fn, hooks, saving_listeners) + logging.info('Loss for final step: %s.', loss) + return self def _convert_train_steps_to_hooks(self, steps, max_steps): if steps is not None or max_steps is not None: @@ -415,14 +410,15 @@ class Estimator(object): ValueError: If no model has been trained, namely `model_dir`, or the given `checkpoint_path` is empty. """ - hooks = _check_hooks_type(hooks) - hooks.extend(self._convert_eval_steps_to_hooks(steps)) + with context.graph_mode(): + hooks = _check_hooks_type(hooks) + hooks.extend(self._convert_eval_steps_to_hooks(steps)) - return self._evaluate_model( - input_fn=input_fn, - hooks=hooks, - checkpoint_path=checkpoint_path, - name=name) + return self._evaluate_model( + input_fn=input_fn, + hooks=hooks, + checkpoint_path=checkpoint_path, + name=name) def _convert_eval_steps_to_hooks(self, steps): if steps is None: @@ -479,45 +475,48 @@ class Estimator(object): `predictions`. For example if `predict_keys` is not `None` but `EstimatorSpec.predictions` is not a `dict`. """ - hooks = _check_hooks_type(hooks) - # Check that model has been trained. - if not checkpoint_path: - checkpoint_path = saver.latest_checkpoint(self._model_dir) - if not checkpoint_path: - raise ValueError('Could not find trained model in model_dir: {}.'.format( - self._model_dir)) + with context.graph_mode(): + hooks = _check_hooks_type(hooks) + # Check that model has been trained. 
+ if not checkpoint_path: + checkpoint_path = saver.latest_checkpoint(self._model_dir) + if not checkpoint_path: + raise ValueError( + 'Could not find trained model in model_dir: {}.'.format( + self._model_dir)) - with ops.Graph().as_default() as g: - random_seed.set_random_seed(self._config.tf_random_seed) - self._create_and_assert_global_step(g) - features, input_hooks = self._get_features_from_input_fn( - input_fn, model_fn_lib.ModeKeys.PREDICT) - estimator_spec = self._call_model_fn( - features, None, model_fn_lib.ModeKeys.PREDICT, self.config) - predictions = self._extract_keys(estimator_spec.predictions, predict_keys) - all_hooks = list(input_hooks) - all_hooks.extend(hooks) - all_hooks.extend(list(estimator_spec.prediction_hooks or [])) - with training.MonitoredSession( - session_creator=training.ChiefSessionCreator( - checkpoint_filename_with_path=checkpoint_path, - master=self._config.master, - scaffold=estimator_spec.scaffold, - config=self._session_config), - hooks=all_hooks) as mon_sess: - while not mon_sess.should_stop(): - preds_evaluated = mon_sess.run(predictions) - if not yield_single_examples: - yield preds_evaluated - elif not isinstance(predictions, dict): - for pred in preds_evaluated: - yield pred - else: - for i in range(self._extract_batch_length(preds_evaluated)): - yield { - key: value[i] - for key, value in six.iteritems(preds_evaluated) - } + with ops.Graph().as_default() as g: + random_seed.set_random_seed(self._config.tf_random_seed) + self._create_and_assert_global_step(g) + features, input_hooks = self._get_features_from_input_fn( + input_fn, model_fn_lib.ModeKeys.PREDICT) + estimator_spec = self._call_model_fn( + features, None, model_fn_lib.ModeKeys.PREDICT, self.config) + predictions = self._extract_keys( + estimator_spec.predictions, predict_keys) + all_hooks = list(input_hooks) + all_hooks.extend(hooks) + all_hooks.extend(list(estimator_spec.prediction_hooks or [])) + with training.MonitoredSession( + session_creator=training.ChiefSessionCreator( + checkpoint_filename_with_path=checkpoint_path, + master=self._config.master, + scaffold=estimator_spec.scaffold, + config=self._session_config), + hooks=all_hooks) as mon_sess: + while not mon_sess.should_stop(): + preds_evaluated = mon_sess.run(predictions) + if not yield_single_examples: + yield preds_evaluated + elif not isinstance(predictions, dict): + for pred in preds_evaluated: + yield pred + else: + for i in range(self._extract_batch_length(preds_evaluated)): + yield { + key: value[i] + for key, value in six.iteritems(preds_evaluated) + } def _assert_members_are_not_overridden(self): """Asserts members of `Estimator` are not overridden.""" @@ -597,73 +596,75 @@ class Estimator(object): are provided, or no checkpoint can be found. """ # pylint: enable=line-too-long - if serving_input_receiver_fn is None: - raise ValueError('serving_input_receiver_fn must be defined.') + with context.graph_mode(): + if serving_input_receiver_fn is None: + raise ValueError('serving_input_receiver_fn must be defined.') - with ops.Graph().as_default() as g: - self._create_and_assert_global_step(g) - random_seed.set_random_seed(self._config.tf_random_seed) - serving_input_receiver = serving_input_receiver_fn() + with ops.Graph().as_default() as g: + self._create_and_assert_global_step(g) + random_seed.set_random_seed(self._config.tf_random_seed) + serving_input_receiver = serving_input_receiver_fn() - # Call the model_fn and collect the export_outputs. 
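
The generator above ends by slicing a batched dict of numpy arrays into one dict per example. A standalone sketch of that slicing, simplified from the code above (the real `_extract_batch_length` additionally checks that every value shares the same batch dimension):

```
import numpy as np
import six


def unbatch_predictions(preds_evaluated):
  # Mirror the yield_single_examples=True branch: one dict per example.
  batch_length = next(iter(preds_evaluated.values())).shape[0]
  for i in range(batch_length):
    yield {key: value[i] for key, value in six.iteritems(preds_evaluated)}


# Two batched examples come back out as two per-example dicts.
batched = {'probabilities': np.array([[0.9, 0.1], [0.2, 0.8]])}
assert len(list(unbatch_predictions(batched))) == 2
```
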
- estimator_spec = self._call_model_fn( - features=serving_input_receiver.features, - labels=None, - mode=model_fn_lib.ModeKeys.PREDICT, - config=self.config) + # Call the model_fn and collect the export_outputs. + estimator_spec = self._call_model_fn( + features=serving_input_receiver.features, + labels=None, + mode=model_fn_lib.ModeKeys.PREDICT, + config=self.config) - # Build the SignatureDefs from receivers and all outputs - signature_def_map = build_all_signature_defs( - serving_input_receiver.receiver_tensors, - estimator_spec.export_outputs, - serving_input_receiver.receiver_tensors_alternatives) + # Build the SignatureDefs from receivers and all outputs + signature_def_map = build_all_signature_defs( + serving_input_receiver.receiver_tensors, + estimator_spec.export_outputs, + serving_input_receiver.receiver_tensors_alternatives) - if not checkpoint_path: - # Locate the latest checkpoint - checkpoint_path = saver.latest_checkpoint(self._model_dir) - if not checkpoint_path: - raise ValueError("Couldn't find trained model at %s." % self._model_dir) + if not checkpoint_path: + # Locate the latest checkpoint + checkpoint_path = saver.latest_checkpoint(self._model_dir) + if not checkpoint_path: + raise ValueError( + "Couldn't find trained model at %s." % self._model_dir) - export_dir = get_timestamped_export_dir(export_dir_base) - temp_export_dir = get_temp_export_dir(export_dir) + export_dir = get_timestamped_export_dir(export_dir_base) + temp_export_dir = get_temp_export_dir(export_dir) - # TODO(soergel): Consider whether MonitoredSession makes sense here - with tf_session.Session(config=self._session_config) as session: + # TODO(soergel): Consider whether MonitoredSession makes sense here + with tf_session.Session(config=self._session_config) as session: - saver_for_restore = estimator_spec.scaffold.saver or saver.Saver( - sharded=True) - saver_for_restore.restore(session, checkpoint_path) + saver_for_restore = estimator_spec.scaffold.saver or saver.Saver( + sharded=True) + saver_for_restore.restore(session, checkpoint_path) - # pylint: disable=protected-access - local_init_op = ( - estimator_spec.scaffold.local_init_op or - monitored_session.Scaffold._default_local_init_op()) - # pylint: enable=protected-access + # pylint: disable=protected-access + local_init_op = ( + estimator_spec.scaffold.local_init_op or + monitored_session.Scaffold._default_local_init_op()) + # pylint: enable=protected-access - # Perform the export - builder = saved_model_builder.SavedModelBuilder(temp_export_dir) - builder.add_meta_graph_and_variables( - session, [tag_constants.SERVING], - signature_def_map=signature_def_map, - assets_collection=ops.get_collection( - ops.GraphKeys.ASSET_FILEPATHS), - legacy_init_op=local_init_op, - strip_default_attrs=strip_default_attrs) - builder.save(as_text) + # Perform the export + builder = saved_model_builder.SavedModelBuilder(temp_export_dir) + builder.add_meta_graph_and_variables( + session, [tag_constants.SERVING], + signature_def_map=signature_def_map, + assets_collection=ops.get_collection( + ops.GraphKeys.ASSET_FILEPATHS), + legacy_init_op=local_init_op, + strip_default_attrs=strip_default_attrs) + builder.save(as_text) - # Add the extra assets - if assets_extra: - assets_extra_path = os.path.join(compat.as_bytes(temp_export_dir), - compat.as_bytes('assets.extra')) - for dest_relative, source in assets_extra.items(): - dest_absolute = os.path.join(compat.as_bytes(assets_extra_path), - compat.as_bytes(dest_relative)) - dest_path = os.path.dirname(dest_absolute) - 
gfile.MakeDirs(dest_path) - gfile.Copy(source, dest_absolute) + # Add the extra assets + if assets_extra: + assets_extra_path = os.path.join(compat.as_bytes(temp_export_dir), + compat.as_bytes('assets.extra')) + for dest_relative, source in assets_extra.items(): + dest_absolute = os.path.join(compat.as_bytes(assets_extra_path), + compat.as_bytes(dest_relative)) + dest_path = os.path.dirname(dest_absolute) + gfile.MakeDirs(dest_path) + gfile.Copy(source, dest_absolute) - gfile.Rename(temp_export_dir, export_dir) - return export_dir + gfile.Rename(temp_export_dir, export_dir) + return export_dir def _get_features_from_input_fn(self, input_fn, mode): """Extracts the `features` from return values of `input_fn`.""" diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py index f4255091bf6..d453e19357a 100644 --- a/tensorflow/python/estimator/estimator_test.py +++ b/tensorflow/python/estimator/estimator_test.py @@ -2287,6 +2287,7 @@ class EstimatorHookOrderingTest(test.TestCase): class EstimatorIntegrationTest(test.TestCase): + @test_util.run_in_graph_and_eager_modes() def test_complete_flow_with_a_simple_linear_model(self): def _model_fn(features, labels, mode): From 2591a66ab804b73f55c1c7a0b105744f94d8a02e Mon Sep 17 00:00:00 2001 From: Russell Power Date: Fri, 20 Apr 2018 17:55:01 -0700 Subject: [PATCH 0561/1734] Automated g4 rollback of changelist 193717076 PiperOrigin-RevId: 193749007 --- tensorflow/contrib/tpu/BUILD | 1 + .../contrib/tpu/python/tpu/keras_support.py | 391 ++++++++++++++++++ 2 files changed, 392 insertions(+) create mode 100644 tensorflow/contrib/tpu/python/tpu/keras_support.py diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD index 9646d15486e..eac210418b5 100644 --- a/tensorflow/contrib/tpu/BUILD +++ b/tensorflow/contrib/tpu/BUILD @@ -162,6 +162,7 @@ py_library( "python/tpu/__init__.py", "python/tpu/bfloat16.py", "python/tpu/device_assignment.py", + "python/tpu/keras_support.py", "python/tpu/topology.py", "python/tpu/tpu.py", "python/tpu/tpu_feed.py", diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py new file mode 100644 index 00000000000..e86ca0a1d8f --- /dev/null +++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py @@ -0,0 +1,391 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""*Experimental* support for running Keras models on the TPU. + +To use, wrap your model with the `keras_support.tpu_model` function. 
+ +Example usage: + +``` +# Must activate before building TPU models +keras_support.setup_tpu_session(master_address) + +image = tf.keras.layers.Input(shape=(28, 28, 3), name='image') +c1 = tf.keras.layers.Conv2D(filters=16, kernel_size=(3, 3))( image) +flattened = tf.keras.layers.Flatten()(c1) +logits = tf.keras.layers.Dense(10, activation='softmax')(flattened) +model = tf.keras.Model(inputs=[image], outputs=[logits]) +model = keras_support.tpu_model(model) + +# Only TF optimizers are currently supported. +model.compile(optimizer=tf.train.AdamOptimizer(), ...) + +# `images` and `labels` should be Numpy arrays. Support for tensor input +# (e.g. datasets) is planned. +model.fit(images, labels) + +# Invoke before shutting down +keras_support.shutdown_tpu_session() +``` +""" + +# pylint: disable=protected-access + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import re + +from tensorflow.contrib.framework.python.framework import experimental +from tensorflow.contrib.tpu.python.ops import tpu_ops +from tensorflow.contrib.tpu.python.tpu import tpu +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.client import session as tf_session +from tensorflow.python.estimator import model_fn as model_fn_lib +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_spec +from tensorflow.python.keras._impl.keras import backend as K +from tensorflow.python.keras._impl.keras import layers +from tensorflow.python.keras._impl.keras import models +from tensorflow.python.keras._impl.keras import optimizers as keras_optimizers +from tensorflow.python.keras._impl.keras.layers import embeddings +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.training import training_util + + +class TPUEmbedding(embeddings.Embedding): + """TPU compatible embedding layer. + + The default Keras layer is not TPU compatible. This layer is a drop-in + replacement: it has the same behavior and will work on CPU and GPU devices. + """ + + def __init__(self, *args, **kw): + super(TPUEmbedding, self).__init__(*args, **kw) + + def build(self, input_shape): + if input_shape[0] is None: + raise ValueError( + 'TPUEmbeddings must have a fixed input_length or input shape.') + return super(TPUEmbedding, self).build(input_shape) + + def call(self, inputs): + if K.dtype(inputs) != 'int32': + inputs = math_ops.cast(inputs, 'int32') + + inputs = array_ops.one_hot(inputs, self.input_dim) + return math_ops.tensordot(inputs, self.embeddings, 1) + + +class CompiledTPUOp( + collections.namedtuple( + 'CompiledTPUOp', + ['tpu_execute_op', 'infeed_tensors', 'infeed_op', 'outfeed_op'])): + pass + + +def _valid_name(tensor_name): + """Return a valid tensor name (strips '/', ':', etc).""" + return re.sub('[^a-zA-Z0-9_-]+', '', tensor_name) + + +class TPUFunction(object): + """K.function compatible interface for invoking a TPU compiled function. + + Recompilation is triggered on-demand for each set of new inputs shapes: the + results are cached for future execution. We expect most computations will + be dominated by a standard batch-size, followed by a straggler batch for + the end of training or evaluation. + + All `inputs` and `outputs` will be loaded via the infeed and outfeed queues + instead of being injected as `feed_dict` items or fetches. 
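
The per-shape recompilation described above reduces to a dict keyed by the tuple of input shapes. A minimal sketch of that cache, with names simplified from the `TPUFunction.__call__` logic that follows:

```
class ShapeKeyedCache(object):
  """One compiled TPU program per distinct tuple of input shapes."""

  def __init__(self, compile_fn):
    self._compile_fn = compile_fn  # e.g. TPUFunction._specialize_model
    self._cache = {}

  def lookup(self, input_specs):
    # XLA requires static shapes, so each new combination of input
    # shapes triggers exactly one (cached) compilation.
    key = tuple(tuple(spec.shape.as_list()) for spec in input_specs)
    if key not in self._cache:
      self._cache[key] = self._compile_fn(input_specs)
    return self._cache[key]
```
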
+ """ + + def __init__(self, model, execution_mode): + self.model = model + self.execution_mode = execution_mode + self._compilation_cache = {} + + def _specialize_model(self, input_specs): + """Specialize `self.model` (a Keras model) for the given input shapes.""" + # Re-create our input and output layers inside our subgraph. They will be + # attached to the true computation when we clone our model in `tpu_fn`. + K.set_learning_phase(self.execution_mode == model_fn_lib.ModeKeys.TRAIN) + + # functools.partial and callable objects are not supported by tpu.rewrite + def _model_fn(): + """Compute fit/eval/predict for the TPU.""" + is_training = self.execution_mode == model_fn_lib.ModeKeys.TRAIN + is_test = self.execution_mode == model_fn_lib.ModeKeys.EVAL + is_predict = self.execution_mode == model_fn_lib.ModeKeys.PREDICT + + # During train/eval, we infeed our features as well as labels. + if is_training or is_test: + infeed_layers = self.model._input_layers + self.model._output_layers + else: + infeed_layers = self.model._input_layers + + # Generate our infeed operation to read features & labels. + infeed_tensors = tpu_ops.infeed_dequeue_tuple( + dtypes=[spec.dtype for spec in input_specs], + shapes=[spec.shape for spec in input_specs], + name='infeed-%s' % self.execution_mode) + + assert len(infeed_tensors) == len(infeed_layers), ( + 'Infeed inputs did not match model: %s vs %s', (infeed_layers, + infeed_tensors)) + + tpu_targets = [] + tpu_inputs = [] + + # Sort infeed outputs into inputs and labels for calling our Keras model. + for tensor, layer in zip(infeed_tensors, infeed_layers): + if layer in self.model._input_layers: + tpu_inputs.append(layers.Input(name=layer.name, tensor=tensor)) + if layer in self.model._output_layers: + tpu_targets.append(tensor) + + optimizer = self.model.optimizer + optimizer.iterations = training_util.get_or_create_global_step() + + # Call our model with our infeed inputs (re-using the weights). + model_outputs = self.model(tpu_inputs) + child_model = models.Model(inputs=tpu_inputs, outputs=model_outputs) + if is_training or is_test: + child_model.compile( + optimizer=self.model.optimizer, + loss=self.model.loss, + loss_weights=self.model.loss_weights, + metrics=self.model.metrics, + weighted_metrics=self.model.weighted_metrics, + target_tensors=tpu_targets, + ) + + # Compute our outfeed depending on the execution mode + if is_training: + child_model._make_train_function() + self._outfeed_spec = [ + tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name) + for tensor in child_model.train_function.outputs + ] + return [ + child_model.train_function.updates_op, + tpu_ops.outfeed_enqueue_tuple( + child_model.train_function.outputs, name='oufeed-enqueue-train') + ] + elif is_test: + child_model._make_test_function() + self._outfeed_spec = [ + tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name) + for tensor in child_model.test_function.outputs + ] + return [ + tpu_ops.outfeed_enqueue_tuple( + child_model.test_function.outputs, name='outfeed-enqueue-test') + ] + elif is_predict: + child_model._make_predict_function() + self._outfeed_spec = [ + tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name) + for tensor in child_model.predict_function.outputs + ] + return [ + tpu_ops.outfeed_enqueue_tuple( + child_model.predict_function.outputs, + name='outfeed-enqueue-predict', + ) + ] + else: + assert False, 'Unexpected execution mode: %s' % self.execution_mode + + # Capture outfeed metadata computed during the rewrite. 
+ self._outfeed_spec = None + + tpu_execute_op = tpu.rewrite(_model_fn) + + K._initialize_variables(K.get_session()) # pylint-disable: protected-access + + # Generate CPU side operations to enqueue features/labels and dequeue + # outputs from the model call. + with ops.device('/device:TPU:0'): + infeed_tensors = [] + for spec in input_specs: + infeed_tensors.append( + array_ops.placeholder( + dtype=spec.dtype, + shape=spec.shape, + name='infeed-enqueue-%s' % spec.name)) + + infeed_op = tpu_ops.infeed_enqueue_tuple( + infeed_tensors, [spec.shape for spec in input_specs], + name='infeed-enqueue-%s' % self.execution_mode) + + outfeed_op = tpu_ops.outfeed_dequeue_tuple( + dtypes=[spec.dtype for spec in self._outfeed_spec], + shapes=[spec.shape for spec in self._outfeed_spec], + name='outfeed-dequeue-%s' % self.execution_mode) + + return CompiledTPUOp(tpu_execute_op, infeed_tensors, infeed_op, outfeed_op) + + def __call__(self, inputs): + assert isinstance(inputs, list) + + # Strip sample weight from inputs + if (self.execution_mode == model_fn_lib.ModeKeys.TRAIN or + self.execution_mode == model_fn_lib.ModeKeys.EVAL): + input_tensors = self.model._feed_inputs + self.model._feed_targets + inputs = inputs[:len(input_tensors)] + else: + input_tensors = self.model._feed_inputs + + # Compute an input specification (used to generate infeed enqueue and + # dequeue operations). We use the shape from our input array and the + # dtype from our model. A user may pass in a float64 for a float32 + # input: for model compatibility we still must generate a float32 infeed. + input_specs = [] + for tensor, ary in zip(input_tensors, inputs): + input_specs.append( + tensor_spec.TensorSpec(ary.shape, tensor.dtype, + _valid_name(tensor.name))) + + # XLA requires every operation in the graph has a fixed shape. To + # handle varying batch sizes we recompile a new sub-graph for each + # unique input shape. + shape_key = tuple([tuple(spec.shape.as_list()) for spec in input_specs]) + + if shape_key not in self._compilation_cache: + logging.info('New input shapes; (re-)compiling: mode=%s, %s', + self.execution_mode, input_specs) + self._compilation_cache[shape_key] = self._specialize_model(input_specs) + + compiled_model = self._compilation_cache[shape_key] + + infeed_dict = {} + for tensor, value in zip(compiled_model.infeed_tensors, inputs): + infeed_dict[tensor] = value + + session = K.get_session() + _, _, outfeed_outputs = session.run([ + compiled_model.infeed_op, compiled_model.tpu_execute_op, + compiled_model.outfeed_op + ], infeed_dict) + + return outfeed_outputs + + +@experimental +def setup_tpu_session(master): + """Initializes and returns a Keras/TF session connected the TPU `master`.""" + session = tf_session.Session( + target=master, config=config_pb2.ConfigProto(isolate_session_state=True)) + K.set_session(session) + K.get_session().run(tpu.initialize_system()) + K.manual_variable_initialization(True) + return session + + +@experimental +def shutdown_tpu_session(session=None): + """Shutdown the TPU attached to session. + + This should be called to cleanly shut down the TPU system before the client + exits. + + Args: + session: Session to shutdown, or None to use the default session. 
+ + Returns: + + """ + if session is None: + session = K.get_session() + + session.run(tpu.shutdown_system()) + + +class KerasTPUModel(models.Model): + """TPU compatible Keras model wrapper.""" + + def __init__(self, inputs, outputs, name=None): + super(models.Model, self).__init__( + inputs=inputs, + outputs=outputs, + name=name, + ) + self.predict_function = None + self.test_function = None + self.train_function = None + + def compile(self, + optimizer, + loss=None, + metrics=None, + loss_weights=None, + sample_weight_mode=None, + weighted_metrics=None, + target_tensors=None, + **kwargs): + if sample_weight_mode: + raise ValueError('sample_weight_mode not supported for TPU execution.') + if weighted_metrics: + raise ValueError('weighted_metrics not supported for TPU execution.') + if target_tensors: + raise ValueError('target_tensors is not supported for TPU execution.') + + super(KerasTPUModel, self).compile(optimizer, loss, metrics, loss_weights, + sample_weight_mode, weighted_metrics, + target_tensors, **kwargs) + + # Keras optimizers are not compatible with TPU rewrite + if not isinstance(self.optimizer, keras_optimizers.TFOptimizer): + raise ValueError( + 'Optimizer must be a TFOptimizer, got: %s' % self.optimizer) + + def train_on_batch(self, x, y, sample_weight=None, class_weight=None): + return super(KerasTPUModel, self).train_on_batch(x, y, sample_weight, + class_weight) + + def _make_train_function(self): + if not self.train_function: + self.train_function = TPUFunction(self, model_fn_lib.ModeKeys.TRAIN) + + return self.train_function + + def _make_test_function(self): + if not self.test_function: + self.test_function = TPUFunction(self, model_fn_lib.ModeKeys.EVAL) + return self.test_function + + def _make_predict_function(self): + if not self.predict_function: + self.predict_function = TPUFunction(self, model_fn_lib.ModeKeys.PREDICT) + return self.predict_function + + def cpu_model(self): + return models.Model( + inputs=self.inputs, + outputs=self.outputs, + name=self.name, + ) + + +@experimental +def tpu_model(model): + return KerasTPUModel( + inputs=model.inputs, outputs=model.outputs, name=model.name) From 7cf9b65492121961f98481fa06a0398698c6c0a3 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Fri, 20 Apr 2018 18:29:01 -0700 Subject: [PATCH 0562/1734] Automated g4 rollback of changelist 193605910 PiperOrigin-RevId: 193751624 --- tensorflow/core/grappler/optimizers/BUILD | 4 - .../grappler/optimizers/function_optimizer.cc | 126 +------ .../grappler/optimizers/function_optimizer.h | 6 +- .../optimizers/function_optimizer_test.cc | 32 +- .../grappler/optimizers/meta_optimizer.cc | 334 +++++++----------- .../core/grappler/optimizers/meta_optimizer.h | 33 +- .../optimizers/meta_optimizer_test.cc | 172 +-------- tensorflow/core/grappler/utils/functions.cc | 12 +- tensorflow/core/grappler/utils/functions.h | 40 +-- .../core/grappler/utils/functions_test.cc | 8 +- 10 files changed, 198 insertions(+), 569 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index 42c3580d40f..3f573cda101 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -517,13 +517,11 @@ cc_library( ":loop_optimizer", ":memory_optimizer", ":model_pruner", - "//tensorflow/core:core_cpu_base", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler/utils:colocation", - "//tensorflow/core/grappler/utils:functions", "//tensorflow/core/grappler/utils:topological_sort", ], ) @@ -540,11 +538,9 @@ tf_cuda_cc_test( "//tensorflow/core:tensorflow", "//tensorflow/core:test", "//tensorflow/core:test_main", - "//tensorflow/core:testlib", "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler:utils", "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder", - "//tensorflow/core/grappler/utils:grappler_test", ], ) diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc index 950933b9335..d008a9719fe 100644 --- a/tensorflow/core/grappler/optimizers/function_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc @@ -22,7 +22,6 @@ limitations under the License. 
#include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/graph_def_util.h" #include "tensorflow/core/framework/node_def.pb.h" -#include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op_def.pb.h" #include "tensorflow/core/framework/versions.pb.h" #include "tensorflow/core/graph/graph_constructor.h" @@ -76,10 +75,12 @@ string UniqueSpecializedFunctionName(const FunctionDef& func, class FunctionOptimizerContext { public: - explicit FunctionOptimizerContext(RewriterConfig::Toggle opt_level, - const GrapplerItem& item) - : function_library_(OpRegistry::Global(), item.graph.library()) { - InitializeInlinedFunctions(opt_level, item); + explicit FunctionOptimizerContext(const GrapplerItem& item, + RewriterConfig::Toggle opt_level) + : opt_level_(opt_level), + function_library_(FunctionLibraryDefinition(OpRegistry::Global(), + item.graph.library())) { + InitializeInlinedFunctions(item); } const FunctionLibraryDefinition& function_library() const { @@ -100,9 +101,8 @@ class FunctionOptimizerContext { } private: - void InitializeInlinedFunctions(RewriterConfig::Toggle opt_level, - const GrapplerItem& item) { - bool aggressive = opt_level == RewriterConfig::AGGRESSIVE; + void InitializeInlinedFunctions(const GrapplerItem& item) { + bool aggressive = opt_level_ == RewriterConfig::AGGRESSIVE; for (const FunctionDef& func : item.graph.library().function()) { // Can't create IdentityN nodes with no input or output: skip these @@ -120,6 +120,7 @@ class FunctionOptimizerContext { } } + RewriterConfig::Toggle opt_level_; FunctionLibraryDefinition function_library_; // Functions that can be inlined into optimized graph. std::unordered_map inlined_functions_; @@ -127,93 +128,9 @@ class FunctionOptimizerContext { TF_DISALLOW_COPY_AND_ASSIGN(FunctionOptimizerContext); }; -// Return trimmed FunctionDefLibrary with functions that are reachable from -// the optimized graph. -FunctionDefLibrary TrimFunctionLibrary(const FunctionLibraryDefinition& flib, - const GraphDef& optimized_graph) { - // Functions that are reachable from the optimized graph. - std::unordered_set keep_funcs; - - std::vector func_queue; - func_queue.reserve(flib.num_functions()); - - // Add registered and not already processed functions to the queue by name. - const auto add_to_func_queue = [&](const string& func_name) { - const FunctionDef* func = flib.Find(func_name); - if (func && keep_funcs.find(func_name) == keep_funcs.end()) { - func_queue.push_back(func); - } - }; - - // Find all the functions that are reachable from the given node. - const auto add_node_to_func_queue = [&](const NodeDef& node) { - // Node itself can be a call to the function. - add_to_func_queue(node.op()); - - // Or node can have an attribute referencing a function. - for (const auto& attr : node.attr()) { - const auto& attr_value = attr.second; - - // 1. AttrValue.func - if (attr_value.has_func()) { - add_to_func_queue(attr_value.func().name()); - } - - // 2. AttrValue.ListValue.func - if (attr_value.has_list()) { - for (const auto& func : attr_value.list().func()) { - add_to_func_queue(func.name()); - } - } - } - }; - - // Add all functions that are directly called from the optimized graph. - const auto& graph_nodes = optimized_graph.node(); - std::for_each(graph_nodes.begin(), graph_nodes.end(), add_node_to_func_queue); - - // Process all reachable functions. 
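
For reference, the `TrimFunctionLibrary` pass deleted by this rollback is a worklist reachability computation over the function call graph. A condensed sketch of that idea; `callees` is a hypothetical callback standing in for the node-def and attribute walking the real code performs:

```
#include <functional>
#include <string>
#include <unordered_set>
#include <vector>

// Keep only functions reachable from the main graph, following both
// direct calls and functions referenced through node attributes.
std::unordered_set<std::string> ReachableFunctions(
    const std::vector<std::string>& roots,
    const std::function<std::vector<std::string>(const std::string&)>&
        callees) {
  std::unordered_set<std::string> keep;
  std::vector<std::string> queue(roots.begin(), roots.end());
  while (!queue.empty()) {
    const std::string func = queue.back();
    queue.pop_back();
    if (!keep.insert(func).second) continue;  // already visited
    for (const std::string& callee : callees(func)) queue.push_back(callee);
  }
  return keep;
}
```
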
- while (!func_queue.empty()) { - const FunctionDef* func = func_queue.back(); - func_queue.pop_back(); - - const string& func_name = func->signature().name(); - keep_funcs.insert(func_name); - - // Find all the functions that called from the function body. - const auto& func_body = func->node_def(); - std::for_each(func_body.begin(), func_body.end(), add_node_to_func_queue); - - // Check if the function has a registered gradient. - const string grad_func_name = flib.FindGradient(func_name); - if (!grad_func_name.empty()) add_to_func_queue(grad_func_name); - } - - FunctionDefLibrary lib; - for (const string& func_name : keep_funcs) { - const FunctionDef* func = CHECK_NOTNULL(flib.Find(func_name)); - *lib.add_function() = *func; - - const string grad_func_name = flib.FindGradient(func_name); - if (!grad_func_name.empty()) { - GradientDef* gd = lib.add_gradient(); - gd->set_function_name(func_name); - gd->set_gradient_func(grad_func_name); - } - } - - VLOG(3) << "Trimmed function library: " << keep_funcs.size() << " functions (" - << static_cast(keep_funcs.size() - flib.num_functions()) << ")"; - - return lib; -} - Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func, FunctionOptimizerContext* ctx, GraphDef* optimized_graph) { - VLOG(2) << "Specialize function instantiation: " - << SummarizeNodeDef(func_node); - const std::unordered_map func_attr( func_node.attr().begin(), func_node.attr().end()); @@ -224,20 +141,20 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func, TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, func_attr, flib, &item)); // TODO(ezhulenev): Push down const inputs and known input shapes. - FunctionDef specialized_func; - TF_RETURN_IF_ERROR(MakeFunctionDef(item, flib, &specialized_func)); + FunctionDef specialized; + TF_RETURN_IF_ERROR(MakeSpecializedFunctionDef(item, flib, &specialized)); // Find a name for specialized function. const string specialized_func_name = UniqueSpecializedFunctionName(func, func_node, flib); - specialized_func.mutable_signature()->set_name(specialized_func_name); - auto* specialized_attr = specialized_func.mutable_attr(); + specialized.mutable_signature()->set_name(specialized_func_name); + auto* specialized_attr = specialized.mutable_attr(); (*specialized_attr)[kGrapplerSpecializedFuncAttr].set_b(true); // Add specialized function to the library. TF_RETURN_IF_ERROR( - ctx->mutable_function_library().AddFunctionDef(specialized_func)); + ctx->mutable_function_library().AddFunctionDef(specialized)); // Add a function call node for the specialized function. 
NodeDef* specialized_func_node = optimized_graph->add_node(); @@ -309,8 +226,6 @@ Status HookInlinedFunctionOutputs( Status InlineFunction(const NodeDef& func_node, const FunctionDef& func, const FunctionOptimizerContext& ctx, GraphDef* optimized_graph) { - VLOG(2) << "Inline function instantiation: " << SummarizeNodeDef(func_node); - const std::unordered_map func_attr( func_node.attr().begin(), func_node.attr().end()); @@ -444,8 +359,6 @@ class SymbolicGradientEnv { Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env, GraphDef* inlined_graph) { - VLOG(2) << "Inline symbolic gradient: " << SummarizeNodeDef(node); - GraphDef graph_def; // Create a node to anchor the gradient inputs @@ -541,16 +454,13 @@ Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env, Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, GraphDef* optimized_graph) { - VLOG(2) << "Optimize function library: id=" << item.id; - // Nothing to do here. if (item.graph.library().function_size() == 0) { - VLOG(3) << "Skip Grappler item with empty function library"; *optimized_graph = item.graph; return Status::OK(); } - FunctionOptimizerContext ctx(opt_level_, item); + FunctionOptimizerContext ctx(item, opt_level_); SymbolicGradientEnv env(item.graph.versions().producer(), item.graph.library()); @@ -596,11 +506,9 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, *optimized_graph->add_node() = node; } + // TODO(bsteiner): trim the library to remove unused function definitions *optimized_graph->mutable_versions() = item.graph.versions(); - *optimized_graph->mutable_library() = - options_.enable_trim_function_library - ? TrimFunctionLibrary(ctx.function_library(), *optimized_graph) - : ctx.function_library().ToProto(); + *optimized_graph->mutable_library() = ctx.function_library().ToProto(); return Status::OK(); } diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.h b/tensorflow/core/grappler/optimizers/function_optimizer.h index e307b4e533f..c555fadf83a 100644 --- a/tensorflow/core/grappler/optimizers/function_optimizer.h +++ b/tensorflow/core/grappler/optimizers/function_optimizer.h @@ -26,9 +26,8 @@ namespace grappler { // operations to make the overall graph more efficient. 
class FunctionOptimizer : public GraphOptimizer { public: - explicit FunctionOptimizer(RewriterConfig::Toggle opt_level) - : opt_level_(opt_level) {} - ~FunctionOptimizer() override = default; + FunctionOptimizer(RewriterConfig::Toggle opt_level) : opt_level_(opt_level) {} + ~FunctionOptimizer() override {} string name() const override { return "function_optimizer"; }; @@ -45,7 +44,6 @@ class FunctionOptimizer : public GraphOptimizer { bool enable_function_inlining = true; bool enable_function_specialization = true; bool enable_symbolic_gradient_inlining = true; - bool enable_trim_function_library = true; }; RewriterConfig::Toggle opt_level_; diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc index 6147e8a27c0..fb006d48688 100644 --- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc +++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc @@ -31,8 +31,20 @@ constexpr char kDevice[] = "/device:CPU:0"; class FunctionOptimizerTest : public GrapplerTest { protected: - void DisableFunctionSpecialization(FunctionOptimizer* optimizer) { + void DisableAll(FunctionOptimizer* optimizer) { + optimizer->options_.enable_function_inlining = false; optimizer->options_.enable_function_specialization = false; + optimizer->options_.enable_symbolic_gradient_inlining = false; + } + + void EnableOnlyFunctionInlining(FunctionOptimizer* optimizer) { + DisableAll(optimizer); + optimizer->options_.enable_function_inlining = true; + } + + void EnableOnlyFunctionSpecialization(FunctionOptimizer* optimizer) { + DisableAll(optimizer); + optimizer->options_.enable_function_specialization = true; } }; @@ -340,7 +352,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithoutInput) { using test::function::NDef; FunctionOptimizer optimizer(RewriterConfig::DEFAULT); - DisableFunctionSpecialization(&optimizer); // do not specialize noinline func + EnableOnlyFunctionInlining(&optimizer); const Tensor kTwo = test::AsScalar(2); FunctionDef func = FunctionDefHelper::Define( @@ -614,13 +626,14 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) { using test::function::NDef; FunctionOptimizer optimizer(RewriterConfig::DEFAULT); + EnableOnlyFunctionSpecialization(&optimizer); - // Mark XTimesTwo as noinline. + // Mark XTimesTwo as noinline FunctionDef x_times_two = test::function::XTimesTwo(); (*x_times_two.mutable_attr())["_noinline"].set_b(true); std::vector function_library = {x_times_two}; - // Build a graph to compute y = XTimesTwo(x). + // Build a graph to compute y = XTimesTwo(x) GrapplerItem item; item.graph = test::function::GDef( {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice), @@ -631,13 +644,12 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) { GraphDef output; TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output)); - // Make sure that specialized function was added to the library and original - // function was removed. - EXPECT_EQ(1, output.library().function_size()); + // Make sure that specialized function was added to the library + EXPECT_EQ(2, output.library().function_size()); EXPECT_EQ("XTimesTwo_specialized_for_y", - output.library().function(0).signature().name()); + output.library().function(1).signature().name()); - // And 'y' node is calling specialized function. 
+ // And 'y' node is calling specialized function int count = 0; for (const NodeDef& node : output.node()) { if (node.name() == "y" && count++) { @@ -646,7 +658,7 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) { } EXPECT_EQ(1, count); - // And that graph evaluation yields the same result. + // And that graph evaluation yields the same result Tensor pi = test::AsScalar(3.14f); item.fetch = {"z"}; item.feed.emplace_back("x", pi); diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index cdc4698c345..558b8a77e8a 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -14,7 +14,6 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/grappler/optimizers/meta_optimizer.h" -#include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/versions.pb.h" #include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h" @@ -30,7 +29,6 @@ limitations under the License. #include "tensorflow/core/grappler/optimizers/memory_optimizer.h" #include "tensorflow/core/grappler/optimizers/model_pruner.h" #include "tensorflow/core/grappler/utils/colocation.h" -#include "tensorflow/core/grappler/utils/functions.h" #include "tensorflow/core/grappler/utils/topological_sort.h" #include "tensorflow/core/lib/core/status.h" @@ -38,9 +36,6 @@ namespace tensorflow { namespace grappler { namespace { - -constexpr int kDefaultNumberOfIterations = 1; - int64 NumEdges(const GraphDef& graph) { int64 num_edges = 0; for (const auto& node : graph.node()) { @@ -55,138 +50,144 @@ string PrintSizesBeforeAfter(const GraphDef& before, const GraphDef& after) { NumEdges(after), " edges (", NumEdges(after) - NumEdges(before), ")"); } - -int NumIterations(const RewriterConfig& cfg) { - return cfg.meta_optimizer_iterations() == RewriterConfig::DEFAULT_NUM_ITERS - ? kDefaultNumberOfIterations - : cfg.meta_optimizer_iterations(); -} - -// Check if optimizer is allowed to run only once. 
-int IsRunOnceOptimizer(const string& name) { return name == "layout"; } - } // namespace -std::unique_ptr MetaOptimizer::MakeNewOptimizer( - const string& optimizer) const { -#define MK_OPT(NAME, VALUE) \ - if (optimizer == NAME) return std::unique_ptr(VALUE) - - MK_OPT("pruning", new ModelPruner()); - MK_OPT("function", new FunctionOptimizer(cfg_.function_optimization())); - MK_OPT("constfold", new ConstantFolding(cpu_device_)); - MK_OPT("layout", new LayoutOptimizer()); - MK_OPT("memory", new MemoryOptimizer(RewriterConfig::MANUAL)); - MK_OPT("arithmetic", new ArithmeticOptimizer(cfg_.arithmetic_optimization())); - MK_OPT("autoparallel", new AutoParallel(cfg_.auto_parallel().num_replicas())); - MK_OPT("loop", new LoopOptimizer(cfg_.loop_optimization())); - MK_OPT("dependency", new DependencyOptimizer(cfg_.dependency_optimization())); - MK_OPT("debug_stripper", new DebugStripper()); - - return std::unique_ptr(); -#undef MK_OPT -} - -Status MetaOptimizer::InitializeOptimizers( - std::vector>* optimizers) const { - if (!cfg_.disable_model_pruning()) { - optimizers->emplace_back(new ModelPruner()); +std::unique_ptr MetaOptimizer::NewOptimizer( + const string& optimizer) { + std::unique_ptr graph_optimizer; + if (optimizer == "pruning") { + graph_optimizer.reset(new ModelPruner()); } - if (cfg_.function_optimization() != RewriterConfig::OFF) { - optimizers->emplace_back( - new FunctionOptimizer(cfg_.function_optimization())); + if (optimizer == "function") { + graph_optimizer.reset(new FunctionOptimizer(cfg_.function_optimization())); } - if (cfg_.debug_stripper() == RewriterConfig::ON) { - optimizers->emplace_back(new DebugStripper()); + if (optimizer == "constfold") { + graph_optimizer.reset(new ConstantFolding(cpu_device_)); } - if (cfg_.constant_folding() != RewriterConfig::OFF) { - optimizers->emplace_back( - new ConstantFolding(cfg_.constant_folding(), cpu_device_)); + if (optimizer == "layout") { + graph_optimizer.reset(new LayoutOptimizer()); } - if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) { - optimizers->emplace_back( + if (optimizer == "memory") { + graph_optimizer.reset(new MemoryOptimizer(RewriterConfig::MANUAL)); + } + if (optimizer == "arithmetic") { + graph_optimizer.reset( new ArithmeticOptimizer(cfg_.arithmetic_optimization())); } - if (cfg_.loop_optimization() != RewriterConfig::OFF) { - optimizers->emplace_back(new LoopOptimizer(cfg_.loop_optimization())); - } - if (cfg_.dependency_optimization() != RewriterConfig::OFF) { - optimizers->emplace_back( - new DependencyOptimizer(cfg_.dependency_optimization())); - } - if (cfg_.layout_optimizer() != RewriterConfig::OFF) { - optimizers->emplace_back(new LayoutOptimizer()); - } - if (cfg_.memory_optimization() != RewriterConfig::NO_MEM_OPT) { - if (cfg_.memory_optimizer_target_node_name_scope().empty()) { - optimizers->emplace_back( - // Use the default target node name prefix "gradients/" - new MemoryOptimizer(cfg_.memory_optimization())); - } else { - optimizers->emplace_back( - new MemoryOptimizer(cfg_.memory_optimization(), - cfg_.memory_optimizer_target_node_name_scope())); - } - } - if (cfg_.auto_parallel().enable()) { - optimizers->emplace_back( + if (optimizer == "autoparallel") { + graph_optimizer.reset( new AutoParallel(cfg_.auto_parallel().num_replicas())); } - return Status::OK(); + if (optimizer == "loop") { + graph_optimizer.reset(new LoopOptimizer(cfg_.loop_optimization())); + } + if (optimizer == "dependency") { + graph_optimizer.reset( + new DependencyOptimizer(cfg_.dependency_optimization())); + } 
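
Both sides of this hunk implement the same name-to-constructor dispatch; the `MK_OPT` macro removed above is shorthand for one branch of the restored if-chain. A sketch of a single expansion, with the `GraphOptimizer` template argument spelled out:

```
#include <memory>

// Illustration only: what MK_OPT("pruning", new ModelPruner()) in the
// removed MakeNewOptimizer expands to, roughly.
std::unique_ptr<GraphOptimizer> MakeNewOptimizerSketch(
    const string& optimizer) {
  if (optimizer == "pruning")
    return std::unique_ptr<GraphOptimizer>(new ModelPruner());
  // ... one such branch per registered optimizer name ...
  return std::unique_ptr<GraphOptimizer>();  // unknown name
}
```
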
+ if (optimizer == "debug_stripper") { + graph_optimizer.reset(new DebugStripper()); + } + return graph_optimizer; } -Status MetaOptimizer::InitializeOptimizersByName( - std::vector>* optimizers) const { - for (const string& optimizer_name : cfg_.optimizers()) { - auto optimizer = MakeNewOptimizer(optimizer_name); - if (optimizer) { - VLOG(2) << "Registered default graph optimizer: " << optimizer_name; - optimizers->push_back(std::move(optimizer)); - continue; +Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, + GraphDef* optimized_graph) { + std::vector> optimizers; + if (cfg_.optimizers().empty()) { + if (!cfg_.disable_model_pruning()) { + optimizers.push_back(std::unique_ptr(new ModelPruner())); } - - auto custom_optimizer = - CustomGraphOptimizerRegistry::CreateByNameOrNull(optimizer_name); - - if (custom_optimizer) { - VLOG(2) << "Registered custom graph optimizer: " << optimizer_name; - TF_RETURN_IF_ERROR(custom_optimizer->Init()); - optimizers->push_back(std::move(custom_optimizer)); - } else { - VLOG(2) << "Can't register an optimizer by name: " << optimizer_name; + if (cfg_.function_optimization() != RewriterConfig::OFF) { + optimizers.push_back(std::unique_ptr( + new FunctionOptimizer(cfg_.function_optimization()))); + } + if (cfg_.debug_stripper() == RewriterConfig::ON) { + optimizers.push_back( + std::unique_ptr(new DebugStripper())); + } + if (cfg_.constant_folding() != RewriterConfig::OFF) { + optimizers.push_back(std::unique_ptr( + new ConstantFolding(cfg_.constant_folding(), cpu_device_))); + } + if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) { + optimizers.push_back(std::unique_ptr( + new ArithmeticOptimizer(cfg_.arithmetic_optimization()))); + } + if (cfg_.loop_optimization() != RewriterConfig::OFF) { + optimizers.push_back(std::unique_ptr( + new LoopOptimizer(cfg_.loop_optimization()))); + } + if (cfg_.dependency_optimization() != RewriterConfig::OFF) { + optimizers.push_back(std::unique_ptr( + new DependencyOptimizer(cfg_.dependency_optimization()))); + } + if (cfg_.layout_optimizer() != RewriterConfig::OFF) { + optimizers.push_back( + std::unique_ptr(new LayoutOptimizer())); + } + if (cfg_.memory_optimization() != RewriterConfig::NO_MEM_OPT) { + if (cfg_.memory_optimizer_target_node_name_scope().empty()) { + optimizers.push_back(std::unique_ptr( + // Use the default target node name prefix "gradients/" + new MemoryOptimizer(cfg_.memory_optimization()))); + } else { + optimizers.push_back( + std::unique_ptr(new MemoryOptimizer( + cfg_.memory_optimization(), + cfg_.memory_optimizer_target_node_name_scope()))); + } + } + if (cfg_.auto_parallel().enable()) { + optimizers.push_back(std::unique_ptr( + new AutoParallel(cfg_.auto_parallel().num_replicas()))); + } + } else { + const std::set available_optimizers = { + "pruning", "function", "constfold", "layout", + "memory", "autoparallel", "arithmetic", "loop", + "dependency", "debug_stripper"}; + std::vector custom_optimizer_names; + for (const auto& optimizer_name : cfg_.optimizers()) { + if (available_optimizers.find(optimizer_name) != + available_optimizers.end()) { + optimizers.push_back(NewOptimizer(optimizer_name)); + } else { + custom_optimizer_names.push_back(optimizer_name); + } + } + // Now run the custom optimizers. 
+ for (const auto& optimizer_name : custom_optimizer_names) { + std::unique_ptr opt = + CustomGraphOptimizerRegistry::CreateByNameOrNull(optimizer_name); + if (opt == nullptr) continue; + TF_RETURN_IF_ERROR(opt->Init()); + optimizers.push_back(std::move(opt)); } } - return Status::OK(); -} - -Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item, - GraphDef* optimized_graph) { - VLOG(2) << "Optimize GrapplerItem: item.id=" << item.id; - - std::vector> optimizers; - bool register_by_name = !cfg_.optimizers().empty(); - TF_RETURN_IF_ERROR(register_by_name ? InitializeOptimizersByName(&optimizers) - : InitializeOptimizers(&optimizers)); if (optimizers.empty()) { *optimized_graph = item.graph; return Status::OK(); } - // Invariant: optimized_graph contains the most recently optimized version of - // the graph. + // Some optimizers should be run only once. + const std::set run_once_optimizers = {"layout"}; + bool already_optimized = false; + const int num_iterations = + cfg_.meta_optimizer_iterations() == RewriterConfig::DEFAULT_NUM_ITERS + ? 1 + : cfg_.meta_optimizer_iterations(); GrapplerItem optimized_item = item; optimized_graph->Swap(&optimized_item.graph); - - GraphOptimizationResult optimization_result(item.id); - - for (int iteration = 0; iteration < NumIterations(cfg_); ++iteration) { - VLOG(4) << "Starting optimization iteration " << iteration + 1; - + for (int iteration = 0; iteration < num_iterations; ++iteration) { + VLOG(1) << "Starting optimization iteration " << iteration + 1; for (const auto& optimizer : optimizers) { - // Some optimizers can run only once. - if (iteration > 0 && IsRunOnceOptimizer(optimizer->name())) continue; - + // Invariant: optimized_graph contains the most recently optimized + // version of the graph. + if (iteration > 0 && run_once_optimizers.count(optimizer->name())) { + continue; + } uint64 start_us = Env::Default()->NowMicros(); // This swaps the current optimized_graph into optimized item and // resets optimized_graph to an empty graph. @@ -194,118 +195,45 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item, *optimized_graph = GraphDef(); Status status = optimizer->Optimize(cluster, optimized_item, optimized_graph); - uint64 end_us = Env::Default()->NowMicros(); + uint64 end_us = Env::Default()->NowMicros(); + float duration_ms = (end_us - start_us) / 1000.0f; string result; if (!status.ok()) { + VLOG(1) << "Not able to apply optimizer " << optimizer->name() << ": " + << status.ToString(); optimized_graph->Swap(&optimized_item.graph); result = status.ToString(); } else { - optimization_result.is_optimized = true; - float duration_ms = (end_us - start_us) / 1000.0f; + already_optimized = true; result = strings::StrCat( + optimizer->name(), ": ", PrintSizesBeforeAfter(optimized_item.graph, *optimized_graph), ", time = ", duration_ms, "ms."); } - VLOG(4) << optimizer->name() << ": " << result; - - OptimizerResult optimizer_result{optimizer->name(), result}; - optimization_result.results.push_back(optimizer_result); + result_.emplace_back(optimizer->name(), result); + VLOG(1) << result; } } - // Record graph optimization result. - optimization_results_.push_back(optimization_result); - - if (optimization_result.is_optimized) { + if (already_optimized) { TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph)); ReassignColocation(optimized_graph); - // Make sure that the optimizers preserved the graph version. + // Make sure that the optimizers preserved the graph version and library. 
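
Custom optimizers reach the `CreateByNameOrNull` lookup above through a global registry; the tests later in this patch register one the same way. A minimal pass-through sketch, with signatures abbreviated to the calls this file relies on:

```
class PassThroughOptimizer : public CustomGraphOptimizer {
 public:
  string name() const override { return "PassThroughOptimizer"; }

  Status Init() override { return Status::OK(); }

  Status Optimize(Cluster* cluster, const GrapplerItem& item,
                  GraphDef* optimized_graph) override {
    *optimized_graph = item.graph;  // no rewrite: copy the graph through
    return Status::OK();
  }

  void Feedback(Cluster* cluster, const GrapplerItem& item,
                const GraphDef& optimized_graph, double result) override {}
};

REGISTER_GRAPH_OPTIMIZER(PassThroughOptimizer);
```
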
+ DCHECK_GE(optimized_graph->library().function_size(), + item.graph.library().function_size()); + DCHECK_GE(optimized_graph->library().gradient_size(), + item.graph.library().gradient_size()); DCHECK_EQ(optimized_graph->versions().producer(), item.graph.versions().producer()); } - - return Status::OK(); -} - -Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, - GraphDef* optimized_graph) { - optimization_results_.clear(); - - // 1. Optimize main graph - TF_RETURN_IF_ERROR(OptimizeGraph(cluster, item, optimized_graph)); - - // 2. Optimize function library - FunctionLibraryDefinition flib(OpRegistry::Global(), - optimized_graph->library()); - - // Optimize each function only once. - std::unordered_set optimized_funcs; - bool optimize_function_library = true; - - // TODO(ezhulenev): turn it on after fixing ranklab: tune_tf_test. - cfg_.set_constant_folding(RewriterConfig::OFF); - cfg_.set_arithmetic_optimization(RewriterConfig::OFF); - - while (optimize_function_library) { - optimize_function_library = false; - - for (const FunctionDef& func : optimized_graph->library().function()) { - const string& func_name = func.signature().name(); - - // Skip already optimized functions. - if (optimized_funcs.find(func_name) != optimized_funcs.end()) continue; - - // Skip parametrized functions (function type or body is defined only at - // function call time by caller node attributes). - if (IsParametrized(func)) continue; - - VLOG(3) << "Optimize function: function=" << func_name; - - // Function optimization might specialize nested function calls, so we - // have to reset the flag and do at least one more pass over the library. - optimize_function_library = true; - optimized_funcs.insert(func_name); - - // Make a GrapplerItem from a FunctionDef. - GrapplerFunctionItem func_item; - TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, flib, &func_item)); - - // Optimize function body graph. - GraphDef optimized_func_graph; - TF_RETURN_IF_ERROR( - OptimizeGraph(cluster, func_item, &optimized_func_graph)); - - // Function body optimization might have created new specialized - // functions, add them to the library. - TF_RETURN_IF_ERROR(flib.AddLibrary(optimized_func_graph.library())); - - // Convert optimized graph back to FunctionDef. - FunctionDef optimized_func; - func_item.SwapFunctionBody(std::move(optimized_func_graph)); - TF_RETURN_IF_ERROR(MakeFunctionDef(func_item, flib, &optimized_func)); - - // Replace optimized function with a new FunctionDef. - TF_RETURN_IF_ERROR(flib.RemoveFunction(func_name)); - TF_RETURN_IF_ERROR(flib.AddFunctionDef(optimized_func)); - } - - // If optimized at least one function, update the graph library. 
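
The function-library pass removed above runs to a fixed point: optimizing one function body can specialize nested calls and grow the library, so the loop repeats until a full pass adds nothing new. A condensed sketch; `library`, `CurrentFunctionNames`, and `OptimizeFunctionBody` are hypothetical stand-ins for the `FunctionLibraryDefinition` plumbing in the real code:

```
std::unordered_set<string> optimized_funcs;
bool optimize_function_library = true;
while (optimize_function_library) {
  optimize_function_library = false;
  // Snapshot the names first: optimizing a body may append newly
  // specialized functions, which the next outer pass picks up.
  for (const string& name : CurrentFunctionNames(library)) {
    if (!optimized_funcs.insert(name).second) continue;  // already done
    OptimizeFunctionBody(name, &library);
    optimize_function_library = true;
  }
}
```
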
- if (optimize_function_library) { - *optimized_graph->mutable_library() = flib.ToProto(); - } - } - return Status::OK(); } void MetaOptimizer::PrintResult() { - for (const GraphOptimizationResult& graph_result : optimization_results_) { - LOG(INFO) << "Optimization results for grappler item: " << graph_result.id; - for (const OptimizerResult& result : graph_result.results) { - LOG(INFO) << "Return status of optimizer " << result.optimizer_name - << ": " << result.result; - } + for (const auto& result : result_) { + LOG(INFO) << "Return status of optimizer " << result.first << ": " + << result.second; } } diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h index 7cf9a40c2d6..382cfe51d42 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.h +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h @@ -30,7 +30,7 @@ class MetaOptimizer : public GraphOptimizer { public: MetaOptimizer(DeviceBase* cpu_device, const RewriterConfig& cfg) : cpu_device_(cpu_device), cfg_(cfg) {} - ~MetaOptimizer() override = default; + ~MetaOptimizer() override {} string name() const override { return "meta_optimizer"; }; @@ -43,37 +43,10 @@ class MetaOptimizer : public GraphOptimizer { const GraphDef& optimized_graph, double result) override; private: - std::unique_ptr MakeNewOptimizer( - const string& optimizer) const; - - // Initialize active optimizers from RewriterConfig toggles. - Status InitializeOptimizers( - std::vector>* optimizers) const; - // Initialize active optimizers from RewriterConfig optimizer names. - Status InitializeOptimizersByName( - std::vector>* optimizers) const; - - // Run optimization pass over a single GrapplerItem. Meta optimizer might run - // multiple such passes: 1) for the main graph 2) for the function library - Status OptimizeGraph(Cluster* cluster, const GrapplerItem& item, - GraphDef* optimized_graph); - + std::unique_ptr NewOptimizer(const string& optimizer); DeviceBase* const cpu_device_; // may be NULL RewriterConfig cfg_; - - struct OptimizerResult { - string optimizer_name; - string result; - }; - - struct GraphOptimizationResult { - explicit GraphOptimizationResult(const string& id) : id(id) {} - string id; - bool is_optimized = false; - std::vector results; - }; - - std::vector optimization_results_; + std::vector> result_; }; bool MetaOptimizerEnabled(const RewriterConfig& cfg); diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc index 8793ad9633c..d9a386b9be2 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc @@ -16,14 +16,11 @@ limitations under the License. 
#include "tensorflow/core/grappler/optimizers/meta_optimizer.h" #include "tensorflow/cc/ops/standard_ops.h" -#include "tensorflow/core/framework/function_testlib.h" -#include "tensorflow/core/framework/tensor_testutil.h" #include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h" #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h" #include "tensorflow/core/grappler/utils.h" -#include "tensorflow/core/grappler/utils/grappler_test.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" @@ -31,8 +28,6 @@ namespace tensorflow { namespace grappler { namespace { -constexpr char kDevice[] = "/device:CPU:0"; - class TestOptimizer : public CustomGraphOptimizer { public: static void SetOptimized(const bool flag_value) { optimized_ = flag_value; } @@ -61,9 +56,7 @@ bool TestOptimizer::optimized_; REGISTER_GRAPH_OPTIMIZER(TestOptimizer); -class MetaOptimizerTest : public GrapplerTest {}; - -TEST_F(MetaOptimizerTest, RunsCustomOptimizer) { +TEST(MetaOptimizerTest, RunsCustomOptimizer) { TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"}); GrapplerItem item; CHECK(fake_input.NextItem(&item)); @@ -79,7 +72,7 @@ TEST_F(MetaOptimizerTest, RunsCustomOptimizer) { EXPECT_TRUE(TestOptimizer::IsOptimized()); } -TEST_F(MetaOptimizerTest, RunOptimizersTwice) { +TEST(MetaOptimizerTest, RunOptimizersTwice) { TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"}); GrapplerItem item; CHECK(fake_input.NextItem(&item)); @@ -93,167 +86,6 @@ TEST_F(MetaOptimizerTest, RunOptimizersTwice) { TF_EXPECT_OK(status); } -TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) { - using test::function::NDef; - - // Enable ony function optimization. - RewriterConfig rewriter_config; - rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO); - rewriter_config.set_function_optimization(RewriterConfig::ON); - rewriter_config.add_optimizers("function"); - - MetaOptimizer optimizer(nullptr, rewriter_config); - - // Define function library: - // - // MyMul(x, y) = x * y - // *MySquare(x) = MyMul(x, x) - // *MyQuadratic(x) = MySquare(MySquare(x)) - // - // * - marked as noinline - - FunctionDef mul_func = FunctionDefHelper::Create( - "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"}, - {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}}, - /* Mapping between function returns and function node outputs. */ - {{"z", "mul:z:0"}}); - - FunctionDef square_func = FunctionDefHelper::Create( - "MySquare", {"x:T"}, {"z:T"}, {"T: {float, double}"}, - {{{"my_mul"}, "MyMul", {"x", "x"}, {{"T", "$T"}}}}, - /* Mapping between function returns and function node outputs. */ - {{"z", "my_mul:z:0"}}); - (*square_func.mutable_attr())["_noinline"].set_b(true); - - FunctionDef quadratic_func = FunctionDefHelper::Create( - "MyQuadratic", {"x:T"}, {"z:T"}, {"T: {float, double}"}, - {{{"square"}, "MySquare", {"x"}, {{"T", "$T"}}}, - {{"quadratic"}, "MySquare", {"square:z"}, {{"T", "$T"}}}}, - /* Mapping between function returns and function node outputs. 
*/ - {{"z", "quadratic:z:0"}}); - (*quadratic_func.mutable_attr())["_noinline"].set_b(true); - - // Tensorflow graph: - // - // a = tf.Placeholder(tf.float); - // b = tf.Placeholder(tf.int32); - // - // square = MySquare(a); // a^2 - // quadratic = MyQuadratic(b); // b^4 - GrapplerItem item; - item.graph = test::function::GDef( - {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice), - NDef("b", "Placeholder", {}, {{"dtype", DT_INT32}}, kDevice), - // Calls into function library - NDef("square", "MySquare", {"a"}, {{"T", DT_FLOAT}}, kDevice), - NDef("quadratic", "MyQuadratic", {"b"}, {{"T", DT_INT32}}, kDevice), - // Forward outputs - NDef("out_s", "Identity", {"square:0"}, {{"T", DT_FLOAT}}, kDevice), - NDef("out_q", "Identity", {"quadratic:0"}, {{"T", DT_INT32}}, kDevice)}, - // FunctionLib - {mul_func, square_func, quadratic_func}); - - GraphDef output; - TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output)); - - FunctionLibraryDefinition optimized_flib(OpRegistry::Global(), - output.library()); - - // Specialized and optimized functions should be added to the graph. - EXPECT_EQ(6, optimized_flib.num_functions()); - - // MyQuadratic should be specialized once: - // 0. 'quadratic' node in the main graph - const string optimized_0 = "MyQuadratic_specialized_for_quadratic"; - - // MySquare should be specialized and optimized for 3 instantiations: - // 1. 'square' node in the main graph - // 2. 'square' node in the MyQuadratic specialization - // 3. 'quadratic' node in the MyQuadratic specialization - - const string optimized_1 = "MySquare_specialized_for_square"; - const string optimized_2 = "MySquare_specialized_for_square_1"; - const string optimized_3 = "MySquare_specialized_for_quadratic"; - - const FunctionDef* optimized_func_0 = optimized_flib.Find(optimized_0); - const FunctionDef* optimized_func_1 = optimized_flib.Find(optimized_1); - const FunctionDef* optimized_func_2 = optimized_flib.Find(optimized_2); - const FunctionDef* optimized_func_3 = optimized_flib.Find(optimized_3); - - ASSERT_NE(optimized_func_0, nullptr); - ASSERT_NE(optimized_func_1, nullptr); - ASSERT_NE(optimized_func_2, nullptr); - ASSERT_NE(optimized_func_3, nullptr); - - // Graph should call optimized function. - int count = 0; - for (const NodeDef& node : output.node()) { - if (node.name() == "square" && count++) { - EXPECT_EQ("MySquare_specialized_for_square", node.op()); - } else if (node.name() == "quadratic" && count++) { - EXPECT_EQ("MyQuadratic_specialized_for_quadratic", node.op()); - } - } - EXPECT_EQ(2, count); - - // Specialized MySquare should call specialized functions. - count = 0; - for (const NodeDef& node : optimized_func_0->node_def()) { - if (node.name() == "square" && count++) { - EXPECT_EQ(optimized_2, node.op()); - } else if (node.name() == "quadratic" && count++) { - EXPECT_EQ(optimized_3, node.op()); - } - } - EXPECT_EQ(2, count); - - const std::vector optimized_funcs = { - optimized_func_1, optimized_func_1, optimized_func_3}; - - // MyMul should be inlined into all optimized versions of MySquare. 
- for (const FunctionDef* optimized_func : optimized_funcs) { - count = 0; - for (const NodeDef& node : optimized_func->node_def()) { - if (node.name() == "my_mul/inlined_inputs" && count++) { - EXPECT_EQ("IdentityN", node.op()); - EXPECT_EQ(2, node.input_size()); - EXPECT_EQ("x:0", node.input(0)); - EXPECT_EQ("x:0", node.input(1)); - } else if (node.name() == "my_mul/x" && count++) { - EXPECT_EQ("Identity", node.op()); - EXPECT_EQ(1, node.input_size()); - EXPECT_EQ("my_mul/inlined_inputs:output:0", node.input(0)); - } else if (node.name() == "my_mul/y" && count++) { - EXPECT_EQ("Identity", node.op()); - EXPECT_EQ(1, node.input_size()); - EXPECT_EQ("my_mul/inlined_inputs:output:1", node.input(0)); - } else if (node.name() == "my_mul/mul" && count++) { - EXPECT_EQ("Mul", node.op()); - EXPECT_EQ(2, node.input_size()); - EXPECT_EQ("my_mul/x:output:0", node.input(0)); - EXPECT_EQ("my_mul/y:output:0", node.input(1)); - } else if (node.name() == "my_mul" && count++) { - EXPECT_EQ("IdentityN", node.op()); - EXPECT_EQ(1, node.input_size()); - EXPECT_EQ("my_mul/mul:z:0", node.input(0)); - } - EXPECT_TRUE(node.device().empty()); - } - EXPECT_EQ(5, count); - } - - item.fetch = {"out_s", "out_q"}; - item.feed.emplace_back("a", test::AsScalar(2.0f)); - item.feed.emplace_back("b", test::AsScalar(4)); - auto tensors_expected = EvaluateFetchNodes(item); - - GrapplerItem optimized(item, std::move(output)); - auto tensors = EvaluateFetchNodes(optimized); - - test::ExpectTensorEqual(tensors_expected[0], tensors[0]); - test::ExpectTensorEqual(tensors_expected[1], tensors[1]); -} - } // namespace } // namespace grappler } // namespace tensorflow diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc index 790809bc670..638fe1999a6 100644 --- a/tensorflow/core/grappler/utils/functions.cc +++ b/tensorflow/core/grappler/utils/functions.cc @@ -545,12 +545,6 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func, return Status::OK(); } -Status MakeGrapplerFunctionItem(const FunctionDef& func, - const FunctionLibraryDefinition& flib, - GrapplerFunctionItem* item) { - return MakeGrapplerFunctionItem(func, AttrValueMap(), flib, item); -} - // Register GrapplerFunctionItem input arg expansion and function body outputs // in the GrapplerFunctionConnectivity. Status RegisterGrapplerFunctionConnectivity( @@ -566,9 +560,9 @@ Status RegisterGrapplerFunctionConnectivity( return Status::OK(); } -Status MakeFunctionDef(const GrapplerFunctionItem& item, - const FunctionLibraryDefinition& flib, - FunctionDef* func) { +Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item, + const FunctionLibraryDefinition& flib, + FunctionDef* func) { func->mutable_signature()->set_name(item.id); func->mutable_signature()->set_is_stateful(item.is_stateful()); diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h index 5e8b6c69601..ab369bcad7c 100644 --- a/tensorflow/core/grappler/utils/functions.h +++ b/tensorflow/core/grappler/utils/functions.h @@ -38,8 +38,7 @@ using AttrValueMap = std::unordered_map; // function body in place of function inputs and a resolved input data type. struct InputArgExpansion { // TODO(ezhulenev): Add support for functions with tensor sequence inputs of - // different data types. - // TODO(ezhulenev): Support type parametrized inputs? 
+ // different data types string input_name; // name of the function input argument DataType data_type; // input data type bool is_ref; // if true, inputs are required to be refs @@ -54,8 +53,7 @@ struct InputArgExpansion { // tensors of a function body nodes and a resolved output data type struct OutputArgExpansion { // TODO(ezhulenev): Add support for functions with tensor sequence outputs of - // different data types. - // TODO(ezhulenev): Support type parametrized outputs? + // different data types string output_name; // name of the function output argument DataType data_type; // output data type bool is_ref; // if true, outputs are refs @@ -188,6 +186,13 @@ bool HasParametrizedBody(const FunctionDef& func); // Check if function has parametrized type or body. bool IsParametrized(const FunctionDef& func); +// Make a GrapplerFunctionItem from the function definition and attributes. +// Return error if the given function def cannot be converted. +Status MakeGrapplerFunctionItem( + const FunctionDef& func, + const std::unordered_map& func_instantiation_attr, + const FunctionLibraryDefinition& flib, GrapplerFunctionItem* item); + // Register GrapplerFunctionItem input arg expansion and function body outputs // in the GrapplerFunctionConnectivity. Use function library definition to // lookup function body nodes output names and ranges. @@ -195,28 +200,11 @@ Status RegisterGrapplerFunctionConnectivity( const GrapplerFunctionItem& item, const FunctionLibraryDefinition& flib, GrapplerFunctionConnectivity* connectivity); -// Make a GrapplerFunctionItem from the function definition and function -// instantiation attributes (caller node attributes). Returns error if the given -// function def cannot be converted (e.g. not all attributes are defined). -Status MakeGrapplerFunctionItem( - const FunctionDef& func, - const std::unordered_map& func_instantiation_attr, - const FunctionLibraryDefinition& flib, GrapplerFunctionItem* item); - -// Make a GrapplerFunction item from the function definition. Function must be -// fully defined (no type or body parametrization). -// TODO(ezhulenev): Support parametrized functions without fully defined -// instantiation attributes? Do we ever want to optimize parametrized function -// without specializing it to it's instantiation attributes (at least types)? -Status MakeGrapplerFunctionItem(const FunctionDef& func, - const FunctionLibraryDefinition& flib, - GrapplerFunctionItem* item); - -// Make a FunctionDef from the GrapplerFunctionItem. Use function library -// definition to lookup function body nodes output names and ranges. -Status MakeFunctionDef(const GrapplerFunctionItem& item, - const FunctionLibraryDefinition& flib, - FunctionDef* func); +// Make a specialized FunctionDef from the GrapplerFunctionItem. Use function +// library definition to lookup function body nodes output names and ranges. 
+Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item, + const FunctionLibraryDefinition& flib, + FunctionDef* func); } // end namespace grappler } // end namespace tensorflow diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc index 6dfd49b9438..54d235a8a46 100644 --- a/tensorflow/core/grappler/utils/functions_test.cc +++ b/tensorflow/core/grappler/utils/functions_test.cc @@ -524,7 +524,7 @@ TEST_F(FunctionsTest, FromFunctionDefWithoutInput) { EXPECT_EQ("two", cast.input(0)); } -TEST_F(FunctionsTest, MakeFunctionDef) { +TEST_F(FunctionsTest, MakeSpecializedFunctionDef) { const Tensor kTwo = test::AsScalar(2); FunctionDef func = FunctionDefHelper::Define( // Name @@ -550,7 +550,7 @@ TEST_F(FunctionsTest, MakeFunctionDef) { TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item)); FunctionDef specialized; - TF_EXPECT_OK(MakeFunctionDef(item, flib, &specialized)); + TF_EXPECT_OK(MakeSpecializedFunctionDef(item, flib, &specialized)); // Input and output types are resolved based on instantiation attributes. EXPECT_EQ("x", specialized.signature().input_arg(0).name()); @@ -573,7 +573,7 @@ TEST_F(FunctionsTest, MakeFunctionDef) { EXPECT_EQ(2, count); } -TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) { +TEST_F(FunctionsTest, SwapFunctionBodyAndMakeSpecializedFunctionDef) { using test::function::NDef; FunctionDef mul_func = FunctionDefHelper::Create( @@ -606,7 +606,7 @@ TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) { // Replace function body with identity function item.SwapFunctionBody(std::move(id_func_body)); FunctionDef specialized; - TF_EXPECT_OK(MakeFunctionDef(item, flib, &specialized)); + TF_EXPECT_OK(MakeSpecializedFunctionDef(item, flib, &specialized)); // Check that graph body was updated. 
int count = 0; From 2ef955b6d354378a7ca19f1f3cafccfc17f79013 Mon Sep 17 00:00:00 2001 From: Haggai Date: Fri, 20 Apr 2018 18:57:12 -0700 Subject: [PATCH 0563/1734] Abort on invalid fft type or rank --- tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h index 4f6b3633645..0bf693edd0b 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h +++ b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h @@ -195,6 +195,9 @@ void EigenFftWithRank(const EigenDevice& device, void* out, void* operand, device, static_cast(out), static_cast(operand), input_batch, fft_length0, fft_length1, fft_length2); break; + default: + // Unsupported FFT type + abort(); } } @@ -219,6 +222,9 @@ void EigenFftImpl(const EigenDevice& device, void* out, void* operand, input_batch, fft_length0, fft_length1, fft_length2); break; + default: + // Unsupported FFT rank + abort(); } } From 82679654af098df1de27bcdcf6fc6942ccf4f236 Mon Sep 17 00:00:00 2001 From: ADiegoCAlonso Date: Sat, 21 Apr 2018 11:43:51 +0200 Subject: [PATCH 0564/1734] Add __init__py --- tensorflow/examples/tutorials/estimators/__init__.py | 0 tensorflow/examples/tutorials/input_fn/__init__.py | 0 tensorflow/examples/tutorials/layers/__init__.py | 0 tensorflow/examples/tutorials/monitors/__init__.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 tensorflow/examples/tutorials/estimators/__init__.py create mode 100644 tensorflow/examples/tutorials/input_fn/__init__.py create mode 100644 tensorflow/examples/tutorials/layers/__init__.py create mode 100644 tensorflow/examples/tutorials/monitors/__init__.py diff --git a/tensorflow/examples/tutorials/estimators/__init__.py b/tensorflow/examples/tutorials/estimators/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tensorflow/examples/tutorials/input_fn/__init__.py b/tensorflow/examples/tutorials/input_fn/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tensorflow/examples/tutorials/layers/__init__.py b/tensorflow/examples/tutorials/layers/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tensorflow/examples/tutorials/monitors/__init__.py b/tensorflow/examples/tutorials/monitors/__init__.py new file mode 100644 index 00000000000..e69de29bb2d From aed22c552905d74de04c98b34aabedd12926790a Mon Sep 17 00:00:00 2001 From: ADiegoCAlonso Date: Sat, 21 Apr 2018 11:56:10 +0200 Subject: [PATCH 0565/1734] Specify float32 as float type instead of float64 --- tensorflow/examples/tutorials/monitors/iris_monitors.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/examples/tutorials/monitors/iris_monitors.py b/tensorflow/examples/tutorials/monitors/iris_monitors.py index 850d105f7b1..a2b7fe60237 100644 --- a/tensorflow/examples/tutorials/monitors/iris_monitors.py +++ b/tensorflow/examples/tutorials/monitors/iris_monitors.py @@ -32,9 +32,9 @@ IRIS_TEST = os.path.join(os.path.dirname(__file__), "iris_test.csv") def main(unused_argv): # Load datasets. 
training_set = tf.contrib.learn.datasets.base.load_csv_with_header( - filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float) + filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float32) test_set = tf.contrib.learn.datasets.base.load_csv_with_header( - filename=IRIS_TEST, target_dtype=np.int, features_dtype=np.float) + filename=IRIS_TEST, target_dtype=np.int, features_dtype=np.float32) validation_metrics = { "accuracy": @@ -83,7 +83,7 @@ def main(unused_argv): # Classify two new flower samples. new_samples = np.array( - [[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=float) + [[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=np.float32) y = list(classifier.predict(new_samples)) print("Predictions: {}".format(str(y))) From ddda9acc9b922a9983128fc2e47f3541b8e456bc Mon Sep 17 00:00:00 2001 From: Joe Yearsley Date: Sat, 21 Apr 2018 17:12:37 +0100 Subject: [PATCH 0566/1734] Update fold_old_batch_norms.cc Updated as requested --- tensorflow/tools/graph_transforms/fold_old_batch_norms.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc index 988ba25e366..f1d361e07d8 100644 --- a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc +++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc @@ -159,7 +159,7 @@ Status FuseScaleOffsetToConvWeights(const std::vector& scale_values, NodeDef bias_add_node; bias_add_node.set_op("BiasAdd"); bias_add_node.set_name(conv_output_name); - if (HasAttr(conv_node, "data_format")) { + if (!conv_node.attr().count("data_format")) { CopyNodeAttr(conv_node, "data_format", "data_format", &bias_add_node); } CopyNodeAttr(conv_node, "T", "T", &bias_add_node); From cea18851e2d81ee97ebf8e9f6aeddd55a34e3227 Mon Sep 17 00:00:00 2001 From: foo0x29a Date: Sat, 21 Apr 2018 13:30:52 -0300 Subject: [PATCH 0567/1734] fix typo --- .../core/grappler/optimizers/custom_graph_optimizer_registry.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h b/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h index 796da913737..3148a5f809f 100644 --- a/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h +++ b/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h @@ -33,7 +33,7 @@ class CustomGraphOptimizerRegistry { static std::vector GetRegisteredOptimizers(); typedef std::function Creator; - // Regsiter graph optimizer which can be called during program initialization. + // Register graph optimizer which can be called during program initialization. // This class is not thread-safe. static void RegisterOptimizerOrDie(const Creator& optimizer_creator, const string& name); From 364f6eae07fa8f0e2f89a9f665d0af430ea96669 Mon Sep 17 00:00:00 2001 From: Filipe Filardi Date: Sat, 21 Apr 2018 14:45:30 -0300 Subject: [PATCH 0568/1734] Create pull_request_template.md --- pull_request_template.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 pull_request_template.md diff --git a/pull_request_template.md b/pull_request_template.md new file mode 100644 index 00000000000..8b137891791 --- /dev/null +++ b/pull_request_template.md @@ -0,0 +1 @@ + From 31dcaa089bb7e504b85807e9bdb96be2858f1b98 Mon Sep 17 00:00:00 2001 From: Yunxing Dai Date: Fri, 20 Apr 2018 18:31:39 -0700 Subject: [PATCH 0569/1734] [XLA][Doc]Fix up operation semantics of BatchNorm. 
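One caution on the fold_old_batch_norms hunk above: `conv_node.attr().count("data_format")` is nonzero exactly when the attribute is present, so the added `!` inverts the original `HasAttr` guard and would copy a missing attribute. The intended check is presumably:

```cpp
// Copy data_format to the new BiasAdd only when the Conv node actually
// carries the attribute (count() == 1 means present).
if (conv_node.attr().count("data_format")) {
  CopyNodeAttr(conv_node, "data_format", "data_format", &bias_add_node);
}
```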
We somehow committed an old version of the doc (see #, the lhs is what we wanted and the rhs is what got committed). This CL reverts last change to that CL. PiperOrigin-RevId: 193751762 --- .../performance/xla/operation_semantics.md | 56 ++++++++++--------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md index 8373a1219da..f530fe1206c 100644 --- a/tensorflow/docs_src/performance/xla/operation_semantics.md +++ b/tensorflow/docs_src/performance/xla/operation_semantics.md @@ -25,7 +25,7 @@ Calculates gradients of batch norm. `BatchNormGrad(operand, scale, mean, variance, grad_output, epsilon, feature_index)` | Arguments | Type | Semantics | -| -------------- | ----------------------- | -------------------------------- | +| --------------- | ----------------------- | -------------------------------- | | `operand` | `ComputationDataHandle` | n dimensional array to be | : : : normalized (x) : | `scale` | `ComputationDataHandle` | 1 dimensional array | @@ -45,31 +45,37 @@ feature dimension in `operand`), the operation calculates the gradients with respect to `operand`, `offset` and `scale` across all the other dimensions. The `feature_index` must be a valid index for the feature dimension in `operand`. -The three gradients are defined by the following formulas (Assuming a -4-dimensional tensor as `operand` and (l) is the index for feature dimension): +The three gradients are defined by the following formulas (assuming a +4-dimensional tensor as `operand` and with feature dimension index \\(l\\), +batch size `m` and spatial sizes `w` and `h`): -\\( coef_l = \frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h (\nabla y_{ijkl} * (x_{ijkl} - \mu_l) / (\sigma^2_{l}+\epsilon)) \\) +\\[ \begin{split} c_l&= +\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h +\left( \nabla y_{ijkl} \frac{x_{ijkl} - \mu_l}{\sigma^2_l+\epsilon} \right) +\\\\ +\nabla x_{ijkl} &= \frac{\gamma_{l}}{\sqrt{\sigma^2_{l}+\epsilon}} +\left( \nabla y_{ijkl} - \mathrm{mean}(\nabla y) - c_l (x_{ijkl} - \mu_{l}) +\right) +\\\\ +\nabla \gamma_l &= \sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h \left( \nabla y_{ijkl} +\frac{x_{ijkl} - \mu_l}{\sqrt{\sigma^2_{l}+\epsilon}} \right) +\\\\\ +\nabla \beta_l &= \sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h \nabla y_{ijkl} +\end{split} \\] -\\( \nabla x_{ijkl} = \gamma_{l} * (1/\sqrt{\sigma^2_{l}+\epsilon}) * [\nabla y_{ijkl} - mean(\nabla y) - (x_{ijkl} - \mu_{l}) * coef_l] \\) - -\\( \nabla \beta_l = \sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h \nabla y_{ijkl} \\) - -\\( \nabla \gamma_l = \sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h \nabla y_{ijkl} * ((x_{ijkl} - \mu_l) / \sqrt{\sigma^2_{l}+\epsilon}) \\) - -The inputs `mean` and `variance` represents moments value +The inputs `mean` and `variance` represent moments value across batch and spatial dimensions. 
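For concreteness, a rough NumPy transcription of the three gradient formulas above (a sketch only: NHWC layout with the feature dimension last, and the epsilon default, are assumptions rather than part of the op definition):

```python
import numpy as np

def batch_norm_grad(x, gamma, mean, var, grad_y, eps=1e-3):
  axes = (0, 1, 2)  # reduce over the batch (m) and spatial (w, h) dimensions
  c = np.mean(grad_y * (x - mean) / (var + eps), axis=axes)
  grad_x = gamma / np.sqrt(var + eps) * (
      grad_y - np.mean(grad_y, axis=axes) - c * (x - mean))
  grad_gamma = np.sum(grad_y * (x - mean) / np.sqrt(var + eps), axis=axes)
  grad_beta = np.sum(grad_y, axis=axes)
  return grad_x, grad_gamma, grad_beta
```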
The output type is a tuple of three handles: -|Outputs | Type | Semantics | -|------------- | ----------------------- | ------------------------------------ | -|`grad_operand`| `ComputationDataHandle` | gradient with respect to input | -: : : `operand` (\\( \nabla x\\)) : -|`grad_scale` | `ComputationDataHandle` | gradient with respect to input | -: : : `scale` (\\( \nabla \gamma\\)) : -|`grad_offset` | `ComputationDataHandle` | gradient with respect to input | -: : : `offset`(\\( \nabla \beta\\)) : - +| Outputs | Type | Semantics | +| ------------- | ----------------------- | --------------------------------- | +| `grad_operand` | `ComputationDataHandle` | gradient with respect to input | +: : : `operand` (\\( \nabla x\\)) : +| `grad_scale` | `ComputationDataHandle` | gradient with respect to input | +: : : `scale` (\\( \nabla \gamma\\)) : +| `grad_offset` | `ComputationDataHandle` | gradient with respect to input | +: : : `offset`(\\( \nabla \beta\\)) : ## BatchNormInference @@ -440,13 +446,11 @@ area and a computation is performed for each possible position of the window. | `lhs` | `ComputationDataHandle` | rank n+2 array of inputs | | `rhs` | `ComputationDataHandle` | rank n+2 array of kernel | : : : weights : -| `window_strides` | `ArraySlice` | size n array of kernel strides| -| `padding` | `ArraySlice` | n-d array of kernel strides | +| `padding` | `ArraySlice>` : padding : -| `lhs_dilation` | `ArraySlice` | size n lhs dilation factor | -: : : array | -| `rhs_dilation` | `ArraySlice` | size n rhs dilation factor -: : : array | +| `lhs_dilation` | `ArraySlice` | n-d lhs dilation factor array | +| `rhs_dilation` | `ArraySlice` | n-d rhs dilation factor array | Let n be the number of spatial dimensions. The `lhs` argument is a rank n+2 array describing the base area. This is called the input, even though of course From ea3d7ab5455f54a67e24428f159e9170be408d71 Mon Sep 17 00:00:00 2001 From: Filipe Filardi Date: Sat, 21 Apr 2018 14:57:38 -0300 Subject: [PATCH 0570/1734] Create Pull Request Template --- PULL_REQUEST_TEMPLATE.md | 20 ++++++++++++++++++++ pull_request_template.md | 1 - 2 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 PULL_REQUEST_TEMPLATE.md delete mode 100644 pull_request_template.md diff --git a/PULL_REQUEST_TEMPLATE.md b/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 00000000000..075bbc99455 --- /dev/null +++ b/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,20 @@ + + +##### Pull Request Checklist + +- [ ] Read [contributing guideline](CONTRIBUTING.md). +- [ ] Read [code of conduct](CODE_OF_CONDUCT.md). +- [ ] Fill [Contributor License Agreement (CLA)](https://cla.developers.google.com/). +- [ ] Check if my changes are consistent with the [guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md#general-guidelines-and-philosophy-for-contribution). +- [ ] Changes are consistent with the [Coding Style](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md#c-coding-style) +- [ ] Run [Unit Tests](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md#running-unit-tests). + +##### Issue Fix + +- [ ] Yes +- [ ] No + +Fixed issue: + +##### Description + diff --git a/pull_request_template.md b/pull_request_template.md deleted file mode 100644 index 8b137891791..00000000000 --- a/pull_request_template.md +++ /dev/null @@ -1 +0,0 @@ - From 2b5d4f794cc9c2740d27c0e8c1af2b511810e00b Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Fri, 20 Apr 2018 18:37:55 -0700 Subject: [PATCH 0571/1734] [XLA] Redesign: implement XlaComputation::Snapshot, and Client::LoadSnapshot. PiperOrigin-RevId: 193752146 --- tensorflow/compiler/xla/client/client.cc | 5 +++++ tensorflow/compiler/xla/client/client.h | 3 +++ tensorflow/compiler/xla/client/xla_client/BUILD | 2 +- .../compiler/xla/client/xla_client/xla_computation.cc | 11 +++++++++++ .../compiler/xla/client/xla_client/xla_computation.h | 4 ++++ tensorflow/compiler/xla/service/executable.cc | 6 +++--- tensorflow/compiler/xla/service/executable.h | 4 ++-- tensorflow/compiler/xla/service/hlo.proto | 2 +- 8 files changed, 30 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc index f0f94298a05..328e1b8fa84 100644 --- a/tensorflow/compiler/xla/client/client.cc +++ b/tensorflow/compiler/xla/client/client.cc @@ -235,6 +235,11 @@ StatusOr Client::LoadSnapshot(const SessionModule& module) { return Computation(stub_, response.computation()); } +StatusOr Client::LoadSnapshot(const HloSnapshot& module) { + TF_RET_CHECK(module.has_hlo() && module.hlo().has_hlo_module()); + return XlaComputation(module.hlo().hlo_module()); +} + StatusOr> Client::Execute( const Computation& computation, tensorflow::gtl::ArraySlice arguments, diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h index 14c685d94ea..a63ff4c56d1 100644 --- a/tensorflow/compiler/xla/client/client.h +++ b/tensorflow/compiler/xla/client/client.h @@ -255,6 +255,9 @@ class Client { StatusOr LoadSnapshot(const SessionModule& module); + // TODO(b/74197823): This is a part of a NOT YET ready refactor. + StatusOr LoadSnapshot(const HloSnapshot& module); + ServiceInterface* stub() { return stub_; } private: diff --git a/tensorflow/compiler/xla/client/xla_client/BUILD b/tensorflow/compiler/xla/client/xla_client/BUILD index 31fa1241ee4..0d6e207971e 100644 --- a/tensorflow/compiler/xla/client/xla_client/BUILD +++ b/tensorflow/compiler/xla/client/xla_client/BUILD @@ -31,9 +31,9 @@ cc_library( hdrs = ["xla_computation.h"], deps = [ "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:hlo_proto", - "//tensorflow/core:lib", ], ) diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.cc b/tensorflow/compiler/xla/client/xla_client/xla_computation.cc index a6752c60102..72e3935696e 100644 --- a/tensorflow/compiler/xla/client/xla_client/xla_computation.cc +++ b/tensorflow/compiler/xla/client/xla_client/xla_computation.cc @@ -17,7 +17,9 @@ limitations under the License. 
#include +#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/util.h" namespace xla { @@ -26,4 +28,13 @@ StatusOr XlaComputation::GetProgramShape() const { return proto_.program_shape(); } +StatusOr> XlaComputation::Snapshot() const { + if (IsNull()) { + return InvalidArgument("Computation is invalid."); + } + auto session = MakeUnique(); + *session->mutable_hlo()->mutable_hlo_module() = proto_; + return std::move(session); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.h b/tensorflow/compiler/xla/client/xla_client/xla_computation.h index 7ad212aa24c..b70b57e9ffe 100644 --- a/tensorflow/compiler/xla/client/xla_client/xla_computation.h +++ b/tensorflow/compiler/xla/client/xla_client/xla_computation.h @@ -48,6 +48,10 @@ class XlaComputation { const HloModuleProto& proto() const { return proto_; } + // Requests that we snapshot the computation into a serializable protocol + // buffer form. + StatusOr> Snapshot() const; + // Returns true if this object is a null Computation. bool IsNull() const { return unique_id_ == -1; } diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc index 8218b5f7c87..be19b3ff04c 100644 --- a/tensorflow/compiler/xla/service/executable.cc +++ b/tensorflow/compiler/xla/service/executable.cc @@ -163,9 +163,9 @@ Status Executable::DumpSessionModule() { result); } -/* static */ Status Executable::DumpToDirectory(const string& directory_path, - string filename, - const HloSession& hlo_session) { +/* static */ Status Executable::DumpToDirectory( + const string& directory_path, string filename, + const HloSnapshot& hlo_session) { tensorflow::Env* env = tensorflow::Env::Default(); if (!env->IsDirectory(directory_path).ok()) { // NB! CreateDir does not work reliably with multiple XLA threads -- two diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index bdbe119120f..0c95f1a3611 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -156,9 +156,9 @@ class Executable { static Status DumpToDirectory(const string& directory_path, string filename, const SessionModule& session_module); - // Dump hlo_session to directory_path/filename. + // Dump hlo snapshot to directory_path/filename. static Status DumpToDirectory(const string& directory_path, string filename, - const HloSession& hlo_session); + const HloSnapshot& hlo_session); protected: mutable tensorflow::mutex mutex_; diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto index 0c3eb7dcb44..aa6860880b7 100644 --- a/tensorflow/compiler/xla/service/hlo.proto +++ b/tensorflow/compiler/xla/service/hlo.proto @@ -300,7 +300,7 @@ message HloProto { // Encapsulates HloProto together with the arguments, result, and // execution_platform. This message is used for purposes such as // analysis/replay/file-storage. -message HloSession { +message HloSnapshot { // The hlo graph. HloProto hlo = 1; From 1796d17b8b1fa598627a590fad0ef81d138af558 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 20 Apr 2018 20:11:19 -0700 Subject: [PATCH 0572/1734] Fix heuristic for computing gradients of gradients when there are outside_compilation clusters present, to stop creating cycles. 
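A short usage sketch of the snapshot round-trip introduced above (error handling and builder setup elided; `client` and `computation` are assumed to already exist):

```cpp
// Serialize the computation to an HloSnapshot proto, then reload it.
TF_ASSIGN_OR_RETURN(std::unique_ptr<HloSnapshot> snapshot,
                    computation.Snapshot());
TF_ASSIGN_OR_RETURN(XlaComputation reloaded,
                    client->LoadSnapshot(*snapshot));
```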
PiperOrigin-RevId: 193757109 --- tensorflow/contrib/tpu/python/tpu/tpu.py | 38 +++++++----------------- 1 file changed, 10 insertions(+), 28 deletions(-) diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py index a1690dadffe..7b8786304cc 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu.py @@ -173,36 +173,18 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): # gradients, and put the gradient of X in cluster # 'root_cluster.gradient_uid'. # - # When the gradient code adds multiple Ops, it asks them to - # be colocated either with the original Op X, or with one of - # the preceding Ops that was added to the gradient. In other - # words, we want to detect the case where we are colocating - # with an Op that is in cluster root_cluster.gradient_uid - # and put the new Op in that same cluster if the - # gradient_uid is the same (the case that we are in the same - # invocation of gradients, and just adding new Ops to the - # cluster); and in a different cluster if the gradient_uids - # are different (the case that we are in a new invocation of - # gradients, taking the gradient of a previously-computed - # gradient). + # When taking a gradient of a gradient, some ops will be + # colocated with Op in the forward pass (e.g., cluster + # root_cluster) and some in the backward pass (e.g., cluster + # root_cluster.initial_gradient_uid). We need all of the + # grad-of-grad ops to be in the same cluster to avoid cyclic + # dependencies between clusters. We adopt a heuristic that + # puts any op clustered with root_cluster. in + # root_cluster.gradient_uid, even if xxx was + # initial_gradient_uid. self._in_gradient_colocation = op parts = outside_attr.split(".") - if len(parts) > 1: - uid = parts[-1] - if uid == gradient_uid: - # Keep using the same cluster - cluster = outside_attr - else: - # We're taking the gradient of a gradient so make a new - # cluster attr, adding a new '.uid' on the end to - # preserve the invariant that the gradient_uid is the - # suffix after the last '.' in the attr. - cluster = outside_attr + "." + gradient_uid - else: - # We're taking the gradient of an Op in the forward pass, so - # make a new cluster combining the Op's cluster and the - # gradient id. - cluster = outside_attr + "." + gradient_uid + cluster = parts[0] + "." + gradient_uid self._EnterOutsideCompilationScope(cluster=cluster) except ValueError: # The attr was not present: do nothing. From 28b8a3c74f93f9238fa626ec7d32fbddcb56b0a8 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 21 Apr 2018 08:16:47 -0700 Subject: [PATCH 0573/1734] Allow output has a different shape from input in the image.transform (#17011). 
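The cluster-naming heuristic in the tpu.py hunk above boils down to one line; an illustrative sketch (the helper name is hypothetical):

```python
def gradient_cluster(outside_attr, gradient_uid):
  # Anything clustered under "root.<anything>" is re-homed to
  # "root.<gradient_uid>", so all grad-of-grad ops share one cluster.
  return outside_attr.split(".")[0] + "." + gradient_uid

assert gradient_cluster("root", "uid_1") == "root.uid_1"        # forward op
assert gradient_cluster("root.uid_1", "uid_2") == "root.uid_2"  # grad of grad
```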
PiperOrigin-RevId: 193788768 --- tensorflow/contrib/image/kernels/image_ops.cc | 7 ++- tensorflow/contrib/image/kernels/image_ops.h | 2 +- tensorflow/contrib/image/ops/image_ops.cc | 54 +++++++++++++++++-- .../python/kernel_tests/image_ops_test.py | 30 +++++++++++ .../contrib/image/python/ops/image_ops.py | 39 ++++++++------ 5 files changed, 108 insertions(+), 24 deletions(-) diff --git a/tensorflow/contrib/image/kernels/image_ops.cc b/tensorflow/contrib/image/kernels/image_ops.cc index c2e32da133b..ae4b1ba62a8 100644 --- a/tensorflow/contrib/image/kernels/image_ops.cc +++ b/tensorflow/contrib/image/kernels/image_ops.cc @@ -70,6 +70,7 @@ class ImageProjectiveTransform : public OpKernel { void Compute(OpKernelContext* ctx) override { const Tensor& images_t = ctx->input(0); const Tensor& transform_t = ctx->input(1); + const Tensor& output_dim = ctx->input(2); OP_REQUIRES(ctx, images_t.shape().dims() == 4, errors::InvalidArgument("Input images must have rank 4")); OP_REQUIRES(ctx, @@ -83,7 +84,11 @@ class ImageProjectiveTransform : public OpKernel { auto images = images_t.tensor(); auto transform = transform_t.matrix(); Tensor* output_t; - OP_REQUIRES_OK(ctx, ctx->allocate_output(0, images_t.shape(), &output_t)); + // Image is NHWC format. + auto output_shape = images_t.shape(); + output_shape.set_dim(1, output_dim.vec()(0)); + output_shape.set_dim(2, output_dim.vec()(1)); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_t)); auto output = output_t->tensor(); (FillProjectiveTransform(interpolation_))( ctx->eigen_device(), &output, images, transform); diff --git a/tensorflow/contrib/image/kernels/image_ops.h b/tensorflow/contrib/image/kernels/image_ops.h index ad501330617..2320329b923 100644 --- a/tensorflow/contrib/image/kernels/image_ops.h +++ b/tensorflow/contrib/image/kernels/image_ops.h @@ -161,7 +161,7 @@ struct FillProjectiveTransform { void operator()(const Device& device, OutputType* output, const InputType& images, const TransformsType& transform) const { - output->device(device) = images.generate( + output->device(device) = output->generate( ProjectiveGenerator(images, transform, interpolation_)); } }; diff --git a/tensorflow/contrib/image/ops/image_ops.cc b/tensorflow/contrib/image/ops/image_ops.cc index 68771b3d054..e97267fb89f 100644 --- a/tensorflow/contrib/image/ops/image_ops.cc +++ b/tensorflow/contrib/image/ops/image_ops.cc @@ -19,9 +19,55 @@ limitations under the License. namespace tensorflow { +using shape_inference::DimensionHandle; using shape_inference::InferenceContext; using shape_inference::ShapeHandle; +namespace { + +// Sets output[0] to shape [batch_dim,height,width,channel_dim], where +// height and width come from the size_tensor. +Status SetOutputToSizedImage(InferenceContext* c, DimensionHandle batch_dim, + int size_input_idx, DimensionHandle channel_dim) { + // Verify shape of size input. + ShapeHandle size; + TF_RETURN_IF_ERROR(c->WithRank(c->input(size_input_idx), 1, &size)); + DimensionHandle unused; + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(size, 0), 2, &unused)); + + // Get size values from the size tensor. + const Tensor* size_tensor = c->input_tensor(size_input_idx); + DimensionHandle width; + DimensionHandle height; + if (size_tensor == nullptr) { + width = c->UnknownDim(); + height = c->UnknownDim(); + } else { + // TODO(petewarden) - Remove once we have constant evaluation in C++ only. 
+ if (size_tensor->dtype() != DT_INT32) { + return errors::InvalidArgument( + "Bad size input type for SetOutputToSizedImage: Expected DT_INT32 " + "but got ", + DataTypeString(size_tensor->dtype()), " for input #", size_input_idx, + " in ", c->DebugString()); + } + auto vec = size_tensor->vec(); + height = c->MakeDim(vec(0)); + width = c->MakeDim(vec(1)); + } + c->set_output(0, c->MakeShape({batch_dim, height, width, channel_dim})); + return Status::OK(); +} + +Status ResizeShapeFn(InferenceContext* c) { + ShapeHandle input; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input)); + return SetOutputToSizedImage(c, c->Dim(input, 0), 2 /* size_input_idx */, + c->Dim(input, 3)); +} + +} // namespace + // TODO(ringwalt): Add a "fill_mode" argument with "constant", "mirror", etc. // TODO(ringwalt): Add a "fill_constant" argument for constant mode (default 0). // TODO(ringwalt): Add an "output_shape" argument. This is sufficient to @@ -29,13 +75,11 @@ using shape_inference::ShapeHandle; REGISTER_OP("ImageProjectiveTransform") .Input("images: dtype") .Input("transforms: float32") + .Input("output_shape: int32") .Attr("dtype: {uint8, int32, int64, float32, float64}") .Attr("interpolation: string") .Output("transformed_images: dtype") - .SetShapeFn([](InferenceContext* c) { - c->set_output(0, c->input(0)); - return Status::OK(); - }) + .SetShapeFn(ResizeShapeFn) .Doc(R"doc( Applies the given transform to each of the images. @@ -49,7 +93,7 @@ If one row of `transforms` is `[a0, a1, a2, b0, b1, b2, c0, c1]`, then it maps the *output* point `(x, y)` to a transformed *input* point `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`, where `k = c0 x + c1 y + 1`. If the transformed point lays outside of the input -image, the output pixel is set to 0. The output is the same size as the input, +image, the output pixel is set to 0. images: 4D `Tensor`, input image(s) in NHWC format. transforms: 2D `Tensor`, projective transform(s) to apply to the image(s). 
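A short usage sketch of the resulting Python API (shapes and angle are arbitrary; the wrapper changes appear later in this patch):

```python
import numpy as np
import tensorflow as tf
from tensorflow.contrib import image as contrib_image

images = tf.zeros([1, 12, 12, 3])
transforms = contrib_image.angles_to_projective_transforms(np.pi / 2, 12, 12)
# With output_shape set, the result is 8x24 rather than the 12x12 input size.
rotated = contrib_image.transform(images, transforms, output_shape=[8, 24])
```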
diff --git a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py index b50177ae565..c0151d320f9 100644 --- a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py +++ b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py @@ -195,10 +195,40 @@ class ImageOpsTest(test_util.TensorFlowTestCase): x_init_value=test_image) self.assertLess(left_err, 1e-10) + def _test_grad_different_shape(self, input_shape, output_shape): + with self.test_session(): + test_image_shape = input_shape + test_image = np.random.randn(*test_image_shape) + test_image_tensor = constant_op.constant( + test_image, shape=test_image_shape) + test_transform = image_ops.angles_to_projective_transforms( + np.pi / 2, 4, 4) + + if len(output_shape) == 2: + resize_shape = output_shape + elif len(output_shape) == 3: + resize_shape = output_shape[0:2] + elif len(output_shape) == 4: + resize_shape = output_shape[1:3] + output = image_ops.transform( + images=test_image_tensor, + transforms=test_transform, + output_shape=resize_shape) + left_err = gradient_checker.compute_gradient_error( + test_image_tensor, + test_image_shape, + output, + output_shape, + x_init_value=test_image) + self.assertLess(left_err, 1e-10) + def test_grad(self): self._test_grad([16, 16]) self._test_grad([4, 12, 12]) self._test_grad([3, 4, 12, 12]) + self._test_grad_different_shape([16, 16], [8, 8]) + self._test_grad_different_shape([4, 12, 3], [8, 24, 3]) + self._test_grad_different_shape([3, 4, 12, 3], [3, 8, 24, 3]) class BipartiteMatchTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py index c139ae89d8d..a8d8cf8c5c6 100644 --- a/tensorflow/contrib/image/python/ops/image_ops.py +++ b/tensorflow/contrib/image/python/ops/image_ops.py @@ -212,7 +212,11 @@ def translations_to_projective_transforms(translations, name=None): axis=1) -def transform(images, transforms, interpolation="NEAREST", name=None): +def transform(images, + transforms, + interpolation="NEAREST", + output_shape=None, + name=None): """Applies the given transform(s) to the image(s). Args: @@ -229,6 +233,10 @@ def transform(images, transforms, interpolation="NEAREST", name=None): the transform mapping input points to output points. Note that gradients are not backpropagated into transformation parameters. interpolation: Interpolation mode. Supported values: "NEAREST", "BILINEAR". + output_shape: Output dimesion after the transform, [height, width]. + If None, output is the same size as input image. + + name: The name of the op. Returns: Image(s) with the same type and shape as `images`, with the given @@ -255,6 +263,13 @@ def transform(images, transforms, interpolation="NEAREST", name=None): else: raise TypeError("Images should have rank between 2 and 4.") + if output_shape is None: + output_shape = array_ops.shape(images)[1:3] + elif len(output_shape) != 2: + raise TypeError( + "output_shape must either be None or a vector of 2 elements. 
%s" % + str(output_shape)) + if len(transform_or_transforms.get_shape()) == 1: transforms = transform_or_transforms[None] elif transform_or_transforms.get_shape().ndims is None: @@ -265,7 +280,7 @@ def transform(images, transforms, interpolation="NEAREST", name=None): else: raise TypeError("Transforms should have rank 1 or 2.") output = gen_image_ops.image_projective_transform( - images, transforms, interpolation=interpolation.upper()) + images, transforms, output_shape, interpolation=interpolation.upper()) if len(image_or_images.get_shape()) == 2: return output[0, :, :, 0] elif len(image_or_images.get_shape()) == 3: @@ -375,14 +390,6 @@ def _image_projective_transform_grad(op, grad): if image_or_images.dtype.base_dtype not in _IMAGE_DTYPES: raise TypeError("Invalid dtype %s." % image_or_images.dtype) - if len(image_or_images.get_shape()) == 2: - images = image_or_images[None, :, :, None] - elif len(image_or_images.get_shape()) == 3: - images = image_or_images[None, :, :, :] - elif len(image_or_images.get_shape()) == 4: - images = image_or_images - else: - raise TypeError("Images should have rank between 2 and 4") if len(transform_or_transforms.get_shape()) == 1: transforms = transform_or_transforms[None] elif len(transform_or_transforms.get_shape()) == 2: @@ -395,13 +402,11 @@ def _image_projective_transform_grad(op, grad): inverse = linalg_ops.matrix_inverse(transforms) transforms = matrices_to_flat_transforms(inverse) output = gen_image_ops.image_projective_transform( - grad, transforms, interpolation=interpolation) - if len(image_or_images.get_shape()) == 2: - return [output[0, :, :, 0], None] - elif len(image_or_images.get_shape()) == 3: - return [output[0, :, :, :], None] - else: - return [output, None] + images=grad, + transforms=transforms, + output_shape=array_ops.shape(image_or_images)[1:3], + interpolation=interpolation) + return [output, None, None] def bipartite_match(distance_mat, From fe4146d884c8805fceaa6d73d0bcc7fbf21df7cd Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 21 Apr 2018 18:42:03 +0000 Subject: [PATCH 0574/1734] Update .gitignore for cmake generated files After running cmake on Linux with: ``` tensorflow/tools/ci_build/ci_build.sh CMAKE tensorflow/tools/ci_build/builds/cmake.sh ``` the following file is left: ``` ubuntu@ubuntu:~/tensorflow$ git status On branch master Your branch is up-to-date with 'origin/master'. Untracked files: (use "git add ..." to include in what will be committed) api_init_files_list.txt nothing added to commit but untracked files present (use "git add" to track) ubuntu@ubuntu:~/tensorflow$ ``` This fix updates the .gitignore file so that cmake generated files is not added with git inadvertently. 
Signed-off-by: Yong Tang 
---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index be75938ec40..828bbe9bd33 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,6 +27,7 @@ Podfile.lock
 /tensorflow/contrib/lite/examples/ios/simple/data/*.txt
 /tensorflow/contrib/lite/examples/ios/simple/data/*.tflite
 xcuserdata/**
+/api_init_files_list.txt
 
 # Android
 .gradle

From 8f558d67450f3ec6aa0d96af9fad84042d6b79df Mon Sep 17 00:00:00 2001
From: AG Ramesh 
Date: Sat, 21 Apr 2018 15:25:37 -0700
Subject: [PATCH 0575/1734] Replaced calls to the deprecated
 StringPiece::contains with str_util::StrContains

---
 tensorflow/core/graph/mkl_layout_pass.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 5368774f2d2..72a13d4da7a 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -547,14 +547,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
     // If Op has been specifically assigned to a non-CPU device, then No.
     if (!n->assigned_device_name().empty() &&
-        !StringPiece(n->assigned_device_name()).contains(kCPUDeviceSubStr)) {
+        !str_util::StrContains(n->assigned_device_name(), kCPUDeviceSubStr)) {
       result = false;
       reason = "Op has been assigned a runtime device that is not CPU.";
     }
 
     // If user has specifically assigned this op to a non-CPU device, then No.
     if (!n->def().device().empty() &&
-        !StringPiece(n->def().device()).contains(kCPUDeviceSubStr)) {
+        !str_util::StrContains(n->def().device(), kCPUDeviceSubStr)) {
       result = false;
       reason = "User has assigned a device that is not CPU.";
     }
@@ -2691,14 +2691,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
     // If Op has been specifically assigned to a non-CPU device, then No.
     if (!n->assigned_device_name().empty() &&
-        !StringPiece(n->assigned_device_name()).contains(kCPUDeviceSubStr)) {
+        !str_util::StrContains(n->assigned_device_name(), kCPUDeviceSubStr)) {
      result = false;
       reason = "Op has been assigned a runtime device that is not CPU.";
     }
 
     // If user has specifically assigned this op to a non-CPU device, then No.
     if (!n->def().device().empty() &&
-        !StringPiece(n->def().device()).contains(kCPUDeviceSubStr)) {
+        !str_util::StrContains(n->def().device(), kCPUDeviceSubStr)) {
       result = false;
       reason = "User has assigned a device that is not CPU.";
     }

From 5518db48074c3bd125089bccc3edec03c192bf56 Mon Sep 17 00:00:00 2001
From: Bryan Heden 
Date: Sat, 21 Apr 2018 19:45:42 -0500
Subject: [PATCH 0576/1734] update $ source spacing

When viewing install_linux, the spacing was off in the 'Next Steps' section.

---
 tensorflow/docs_src/install/install_linux.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 1a349f54120..02af21bcf23 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -231,7 +231,7 @@ Note that you must activate the Virtualenv environment each time you use
 TensorFlow. If the Virtualenv environment is not currently active, invoke one
 of the following commands:
 
-<pre>  $ source ~/tensorflow/bin/activate      # bash, sh, ksh, or zsh
+<pre> $ source ~/tensorflow/bin/activate      # bash, sh, ksh, or zsh
 $ source ~/tensorflow/bin/activate.csh  # csh or tcsh</pre>
When the Virtualenv environment is active, you may run From 5b7b354efe3eff5756623b04b87b4cd5272f82cc Mon Sep 17 00:00:00 2001 From: David Majnemer Date: Sat, 21 Apr 2018 21:37:48 -0700 Subject: [PATCH 0577/1734] [XLA] Add an option to the CSE pass to ignore non-fusion computations PiperOrigin-RevId: 193814728 --- tensorflow/compiler/xla/service/hlo_cse.cc | 4 ++++ tensorflow/compiler/xla/service/hlo_cse.h | 11 +++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc index cd7cbbdd717..3b22c93733a 100644 --- a/tensorflow/compiler/xla/service/hlo_cse.cc +++ b/tensorflow/compiler/xla/service/hlo_cse.cc @@ -97,6 +97,10 @@ StatusOr HloCSE::Run(HloModule* module) { const std::function eq_computations = std::equal_to(); for (auto* computation : module->computations()) { + if (only_fusion_computations_ && !computation->IsFusionComputation()) { + continue; + } + changed |= CombineConstants(computation, is_layout_sensitive_); std::list post_order = diff --git a/tensorflow/compiler/xla/service/hlo_cse.h b/tensorflow/compiler/xla/service/hlo_cse.h index 70096e07a24..5e2b348bdda 100644 --- a/tensorflow/compiler/xla/service/hlo_cse.h +++ b/tensorflow/compiler/xla/service/hlo_cse.h @@ -29,9 +29,11 @@ class HloCSE : public HloPassInterface { public: // If is_layout_sensitive is true, then the simplifier preserves layout during // transformation. Otherwise, layout is ignored. - explicit HloCSE(bool is_layout_sensitive) - : is_layout_sensitive_(is_layout_sensitive) {} - ~HloCSE() override {} + explicit HloCSE(bool is_layout_sensitive, + bool only_fusion_computations = false) + : is_layout_sensitive_(is_layout_sensitive), + only_fusion_computations_(only_fusion_computations) {} + ~HloCSE() override = default; tensorflow::StringPiece name() const override { return "cse"; } // Run CSE on the given module. Returns whether the module was changed (common @@ -39,7 +41,8 @@ class HloCSE : public HloPassInterface { StatusOr Run(HloModule* module) override; private: - bool is_layout_sensitive_; + const bool is_layout_sensitive_; + const bool only_fusion_computations_; }; } // namespace xla From 292d9b92c93e97e98284787a1a60c30553fee5cb Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 22 Apr 2018 07:13:16 -0700 Subject: [PATCH 0578/1734] Fixed typo in crossed column code snippet. PiperOrigin-RevId: 193838865 --- tensorflow/docs_src/get_started/feature_columns.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/docs_src/get_started/feature_columns.md b/tensorflow/docs_src/get_started/feature_columns.md index d8e4bec8635..9c777a0077a 100644 --- a/tensorflow/docs_src/get_started/feature_columns.md +++ b/tensorflow/docs_src/get_started/feature_columns.md @@ -364,7 +364,7 @@ def make_dataset(latitude, longitude, labels): return tf.data.Dataset.from_tensor_slices((features, labels)) -# Bucketize the latitude and longitude usig the `edges` +# Bucketize the latitude and longitude using the `edges` latitude_bucket_fc = tf.feature_column.bucketized_column( tf.feature_column.numeric_column('latitude'), list(atlanta.latitude.edges)) From e1722aa3197b3942add6b9fb78ed50e21af693ff Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 22 Apr 2018 07:29:33 -0700 Subject: [PATCH 0579/1734] Multi-thread implementation of ExperimentalShuffledFullyConnected using the gemmlowp threadpool. 
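A sketch of how the new HloCSE flag above might be used when assembling a pass pipeline (the pipeline itself is assumed):

```cpp
// Run CSE only inside fusion computations, ignoring layout.
HloPassPipeline pipeline("post-fusion");
pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/false,
                         /*only_fusion_computations=*/true);
TF_RETURN_IF_ERROR(pipeline.Run(module).status());
```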
PiperOrigin-RevId: 193839485 --- .../internal/optimized/optimized_ops.h | 146 +++++++++++++----- 1 file changed, 111 insertions(+), 35 deletions(-) diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h index d2690568006..2e2721e0930 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h @@ -1203,39 +1203,16 @@ void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, output_activation_max, output_data, output_dims, gemm_context); } -inline void ExperimentalShuffledFullyConnected( - const uint8* input_data, const Dims<4>& input_dims, - const uint8* shuffled_weights_data, const Dims<4>& weights_dims, - const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier, - int output_shift, int32 output_activation_min, int32 output_activation_max, - int16* output_data, const Dims<4>& output_dims, - gemmlowp::GemmContext* gemm_context) { - gemmlowp::ScopedProfilingLabel label( - "ExperimentalShuffledFullyConnected/8bit"); - (void)gemm_context; // only used in optimized code. - TFLITE_DCHECK_EQ(output_activation_min, -32768); - TFLITE_DCHECK_EQ(output_activation_max, 32767); - // TODO(benoitjacob): This really should be: - // const int batches = ArraySize(output_dims, 1); - // but the current --variable_batch hack consists in overwriting the 3rd - // dimension with the runtime batch size, as we don't keep track for each - // array of which dimension is the batch dimension in it. - const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) * - ArraySize(output_dims, 3); - const int output_depth = MatchingArraySize(weights_dims, 1, output_dims, 0); - const int accum_depth = ArraySize(weights_dims, 0); - TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); - TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims)); - // The experimental shuffling is an optimization for matrix*vector product. - // We aren't interested in supporting non-matrix*vector-product cases, i.e. - // batches>1. - TFLITE_DCHECK_EQ(batches, 1); - // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) - // so that just reinterpreting them as int8 values is equivalent to - // subtracting 128 from them, thus implementing for free the subtraction of - // the zero_point value 128. - const int8* shuffled_weights_ptr = - reinterpret_cast(shuffled_weights_data); +// Internal function doing the actual arithmetic work for +// ExperimentalShuffledFullyConnected. +// May be called either directly by it (single-threaded case) or may be used +// as the 'task' for worker threads to run (multi-threaded case, see +// ExperimentalShuffledFullyConnectedWorkerTask below). +inline void ExperimentalShuffledFullyConnectedWorkerImpl( + const uint8* input_data, const int8* shuffled_weights_data, + int output_depth, int accum_depth, const int32* bias_data, + int32 output_multiplier, int output_shift, int16* output_data) { + const int8* shuffled_weights_ptr = shuffled_weights_data; #if defined USE_NEON // We'll only need to xor signbit to the input activation values, as // that xor-ing is pre-built into the shuffled weights values. @@ -1331,14 +1308,113 @@ inline void ExperimentalShuffledFullyConnected( acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, -output_shift); // Saturate, cast to int16, and store to output array. 
- acc = std::max(acc, output_activation_min); - acc = std::min(acc, output_activation_max); + acc = std::max(acc, -32768); + acc = std::min(acc, 32767); output_data[c + i] = acc; } } #endif } +// Wraps ExperimentalShuffledFullyConnectedWorkerImpl into a Task class +// to allow using gemmlowp's threadpool. +struct ExperimentalShuffledFullyConnectedWorkerTask : gemmlowp::Task { + ExperimentalShuffledFullyConnectedWorkerTask( + const uint8* input_data, const int8* shuffled_weights_data, + int output_depth, int accum_depth, const int32* bias_data, + int32 output_multiplier, int output_shift, int16* output_data) + : input_data_(input_data), + shuffled_weights_data_(shuffled_weights_data), + output_depth_(output_depth), + accum_depth_(accum_depth), + bias_data_(bias_data), + output_multiplier_(output_multiplier), + output_shift_(output_shift), + output_data_(output_data) {} + + void Run() override { + ExperimentalShuffledFullyConnectedWorkerImpl( + input_data_, shuffled_weights_data_, output_depth_, accum_depth_, + bias_data_, output_multiplier_, output_shift_, output_data_); + } + + const uint8* input_data_; + const int8* shuffled_weights_data_; + int output_depth_; + int accum_depth_; + const int32* bias_data_; + int32 output_multiplier_; + int output_shift_; + int16* output_data_; +}; + +inline void ExperimentalShuffledFullyConnected( + const uint8* input_data, const Dims<4>& input_dims, + const uint8* shuffled_weights_data, const Dims<4>& weights_dims, + const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier, + int output_shift, int32 output_activation_min, int32 output_activation_max, + int16* output_data, const Dims<4>& output_dims, + gemmlowp::GemmContext* gemm_context) { + gemmlowp::ScopedProfilingLabel label( + "ExperimentalShuffledFullyConnected/8bit"); + (void)gemm_context; // only used in optimized code. + TFLITE_DCHECK_EQ(output_activation_min, -32768); + TFLITE_DCHECK_EQ(output_activation_max, 32767); + // TODO(benoitjacob): This really should be: + // const int batches = ArraySize(output_dims, 1); + // but the current --variable_batch hack consists in overwriting the 3rd + // dimension with the runtime batch size, as we don't keep track for each + // array of which dimension is the batch dimension in it. + const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) * + ArraySize(output_dims, 3); + const int output_depth = MatchingArraySize(weights_dims, 1, output_dims, 0); + const int accum_depth = ArraySize(weights_dims, 0); + TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); + TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims)); + // The experimental shuffling is an optimization for matrix*vector product. + // We aren't interested in supporting non-matrix*vector-product cases, i.e. + // batches>1. + TFLITE_DCHECK_EQ(batches, 1); + // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) + // so that just reinterpreting them as int8 values is equivalent to + // subtracting 128 from them, thus implementing for free the subtraction of + // the zero_point value 128. + const int8* int8_shuffled_weights_data = + reinterpret_cast(shuffled_weights_data); + + // Our GEMV kernel has 4 rows. This doesn't matter in practice for GEMV + // shapes, gemmlowp::HowManyThreads only takes that parameter because it + // matters for other kinds of GEMM shapes. 
+ static constexpr int kKernelRows = 4; + const int thread_count = gemmlowp::HowManyThreads( + gemm_context->max_num_threads(), output_depth, 1, accum_depth); + if (thread_count == 1) { + // Single-thread case: do the computation on the current thread, don't + // use a threadpool + ExperimentalShuffledFullyConnectedWorkerImpl( + input_data, int8_shuffled_weights_data, output_depth, accum_depth, + bias_data, output_multiplier, output_shift, output_data); + return; + } + + // Multi-threaded case: use the gemmlowp context's threadpool. + TFLITE_DCHECK_GT(thread_count, 1); + std::vector tasks(thread_count); + const int kRowsPerWorker = + gemmlowp::RoundUp(output_depth / thread_count); + int row_start = 0; + for (int i = 0; i < thread_count; i++) { + int row_end = std::min(output_depth, row_start + kRowsPerWorker); + tasks[i] = new ExperimentalShuffledFullyConnectedWorkerTask( + input_data, int8_shuffled_weights_data + row_start * accum_depth, + row_end - row_start, accum_depth, bias_data + row_start, + output_multiplier, output_shift, output_data + row_start); + row_start = row_end; + } + TFLITE_DCHECK_EQ(row_start, output_depth); + gemm_context->workers_pool()->Execute(tasks); +} + template inline void ExtractPatchIntoBufferColumn( const Dims<4>& input_dims, int w, int h, int b, int kheight, int kwidth, From bfffd2041106dac5b7bb3efcbb311a20505ac61f Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 22 Apr 2018 14:43:21 +0000 Subject: [PATCH 0580/1734] Update docs to add note and examples for tf.count_nonzero with string Signed-off-by: Yong Tang --- tensorflow/python/ops/math_ops.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 31ce83905b0..30ac001c251 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -1466,9 +1466,18 @@ def count_nonzero(input_tensor, tf.count_nonzero(x, [0, 1]) # 3 ``` + **NOTE** Strings are compared against zero-length empty string `""`. Any + string with a size greater than zero is already considered as nonzero. + + For example: + ```python + x = tf.constant(["", "a", " ", "b", ""]) + tf.count_nonzero(x) # 3, with "a", " ", and "b" as nonzero strings. + ``` + Args: - input_tensor: The tensor to reduce. Should be of numeric type, `string`, - or `bool`. + input_tensor: The tensor to reduce. Should be of numeric type, `bool`, + or `string`. axis: The dimensions to reduce. If `None` (the default), reduces all dimensions. Must be in the range `[-rank(input_tensor), rank(input_tensor))`. From 522e20ef9cff8a7a49322c6442d940aa556222c0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 22 Apr 2018 09:15:38 -0700 Subject: [PATCH 0581/1734] Change refs/unrefs of FLR. 
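Each run now takes its own reference on the instantiated function's Item for
as long as the computation is in flight, instead of tying item lifetime to
handle bookkeeping. In sketch form (a simplified shape of the diffs below;
error handling omitted):

    item->Ref();  // Keep the item alive for the duration of the run.
    item->exec->RunAsync(*exec_args, [item, done](const Status& status) {
      core::ScopedUnref unref(item);  // Dropped when the run completes.
      done(status);
    });

ReleaseHandle() correspondingly stops erasing state and only marks the item
invalidated, so a later Instantiate() of the same function key rebuilds it.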
PiperOrigin-RevId: 193843055 --- tensorflow/core/common_runtime/function.cc | 52 ++++++++++--------- .../core/common_runtime/function_test.cc | 27 ++-------- .../function_threadpool_test.cc | 14 +---- .../process_function_library_runtime.cc | 21 +------- .../process_function_library_runtime.h | 3 -- .../process_function_library_runtime_test.cc | 10 ++-- 6 files changed, 38 insertions(+), 89 deletions(-) diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc index d310520ebde..a6f637b4883 100644 --- a/tensorflow/core/common_runtime/function.cc +++ b/tensorflow/core/common_runtime/function.cc @@ -209,6 +209,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime { // The instantiated and transformed function is encoded as a Graph // object, and an executor is created for the graph. struct Item : public core::RefCounted { + bool invalidated = false; const Graph* graph = nullptr; // Owned by exec. const FunctionLibraryDefinition* overlay_lib = nullptr; // Not owned. FunctionBody* func_graph = nullptr; @@ -284,15 +285,7 @@ FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl( } FunctionLibraryRuntimeImpl::~FunctionLibraryRuntimeImpl() { - // The most common patterns of FLR usage don't require the caller to - // explicitly release handles. As a result, we try to unref each item until - // it's erased. - for (auto item : items_) { - if (item.second) { - while (!item.second->Unref()) { - } - } - } + for (auto p : items_) p.second->Unref(); } // An asynchronous op kernel which executes an instantiated function @@ -497,24 +490,30 @@ Status FunctionLibraryRuntimeImpl::Instantiate( options_copy.target = device_name_; const string key = Canonicalize(function_name, attrs, options_copy); + Handle found_handle = kInvalidHandle; { mutex_lock l(mu_); - *handle = parent_->GetHandle(key); - if (*handle != kInvalidHandle) { + found_handle = parent_->GetHandle(key); + if (found_handle != kInvalidHandle) { FunctionLibraryRuntime::LocalHandle handle_on_device = - parent_->GetHandleOnDevice(device_name_, *handle); + parent_->GetHandleOnDevice(device_name_, found_handle); if (handle_on_device == kInvalidLocalHandle) { return errors::Internal("LocalHandle not found for handle ", *handle, "."); } - auto item_handle = items_.find(handle_on_device); - if (item_handle == items_.end()) { + auto iter = items_.find(handle_on_device); + if (iter == items_.end()) { return errors::Internal("LocalHandle ", handle_on_device, - " for handle ", *handle, + " for handle ", found_handle, " not found in items."); } - item_handle->second->Ref(); - return Status::OK(); + Item* item = iter->second; + if (!item->invalidated) { + *handle = found_handle; + return Status::OK(); + } + // *item is invalidated. Fall through and instantiate the given + // function_name/attrs/option again. 
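// (Sketch of the resulting lifecycle: Instantiate() hands out a handle;
// ReleaseHandle() merely sets item->invalidated rather than erasing it; a
// later Instantiate() with the same canonical key reaches this point and
// falls through to build a fresh Item under a new handle.)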
} } @@ -546,10 +545,10 @@ Status FunctionLibraryRuntimeImpl::Instantiate( { mutex_lock l(mu_); - *handle = parent_->GetHandle(key); - if (*handle != kInvalidHandle) { + Handle found_handle_again = parent_->GetHandle(key); + if (found_handle_again != found_handle) { delete fbody; - items_[parent_->GetHandleOnDevice(device_name_, *handle)]->Ref(); + *handle = found_handle_again; } else { *handle = parent_->AddHandle(key, device_name_, next_handle_); Item* item = new Item; @@ -566,16 +565,12 @@ Status FunctionLibraryRuntimeImpl::ReleaseHandle(Handle handle) { if (!parent_->IsInstantiatedOnDevice(device_name_, handle)) { return parent_->ReleaseHandle(handle); } - LocalHandle h = parent_->GetHandleOnDevice(device_name_, handle); CHECK_NE(h, kInvalidLocalHandle); mutex_lock l(mu_); CHECK_EQ(1, items_.count(h)); Item* item = items_[h]; - if (item->Unref()) { - items_.erase(h); - TF_RETURN_IF_ERROR(parent_->RemoveHandle(handle)); - } + item->invalidated = true; // Reinstantiate later. return Status::OK(); } @@ -736,6 +731,7 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle, // computation is done and stored in *rets, we send the return values back // to the source_device (caller) so that the ProcFLR can receive them later. std::vector* remote_args = new std::vector; + item->Ref(); ProcessFunctionLibraryRuntime::ReceiveTensorsAsync( source_device, target_device, "arg_", src_incarnation, args.size(), device_context, {}, rendezvous, remote_args, @@ -747,6 +743,7 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle, s = frame->SetArgs(*remote_args); } if (!s.ok()) { + item->Unref(); delete frame; delete remote_args; delete exec_args; @@ -757,6 +754,7 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle, *exec_args, [item, frame, rets, done, source_device, target_device, target_incarnation, rendezvous, device_context, remote_args, exec_args](const Status& status) { + core::ScopedUnref unref(item); Status s = status; if (s.ok()) { s = frame->ConsumeRetvals(rets); @@ -842,11 +840,13 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle, return; } + item->Ref(); item->exec->RunAsync( // Executor args *exec_args, // Done callback. [item, frame, rets, done, exec_args](const Status& status) { + core::ScopedUnref unref(item); Status s = status; if (s.ok()) { s = frame->ConsumeRetvals(rets); @@ -906,6 +906,7 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle, exec_args->runner = *run_opts.runner; exec_args->call_frame = frame; + item->Ref(); item->exec->RunAsync( // Executor args *exec_args, @@ -914,6 +915,7 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle, [item, frame, exec_args](DoneCallback done, // Start unbound arguments. const Status& status) { + core::ScopedUnref unref(item); delete exec_args; done(status); }, diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc index 61b2f0e60f7..373fc64007e 100644 --- a/tensorflow/core/common_runtime/function_test.cc +++ b/tensorflow/core/common_runtime/function_test.cc @@ -231,19 +231,8 @@ class FunctionLibraryRuntimeTest : public ::testing::Test { return status; } FunctionLibraryRuntime::Options opts; - status = Run(flr, handle, opts, args, rets, add_runner); - if (!status.ok()) return status; - - // Release the handle and try running again. It should not succeed. 
- status = flr->ReleaseHandle(handle); - if (!status.ok()) return status; - - Status status2 = Run(flr, handle, opts, args, std::move(rets)); - EXPECT_TRUE(errors::IsInvalidArgument(status2)); - EXPECT_TRUE( - str_util::StrContains(status2.error_message(), "remote execution.")); - - return status; + TF_RETURN_IF_ERROR(Run(flr, handle, opts, args, rets, add_runner)); + return flr->ReleaseHandle(handle); } Status Run(FunctionLibraryRuntime* flr, FunctionLibraryRuntime::Handle handle, @@ -304,16 +293,8 @@ class FunctionLibraryRuntimeTest : public ::testing::Test { *rets[i] = retvals[i]; } - // Release the handle and try running again. It should not succeed. - status = flr->ReleaseHandle(handle); - if (!status.ok()) return status; - - Status status2 = Run(flr, handle, opts, args, std::move(rets)); - EXPECT_TRUE(errors::IsInvalidArgument(status2)); - EXPECT_TRUE( - str_util::StrContains(status2.error_message(), "remote execution.")); - - return status; + // Release the handle. + return flr->ReleaseHandle(handle); } std::unique_ptr GetFuncBody(FunctionLibraryRuntime* flr, diff --git a/tensorflow/core/common_runtime/function_threadpool_test.cc b/tensorflow/core/common_runtime/function_threadpool_test.cc index 2d09e83d013..98dac38a8cb 100644 --- a/tensorflow/core/common_runtime/function_threadpool_test.cc +++ b/tensorflow/core/common_runtime/function_threadpool_test.cc @@ -144,19 +144,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test { return status; } FunctionLibraryRuntime::Options opts; - status = Run(flr, handle, opts, args, rets, add_runner); - if (!status.ok()) return status; - - // Release the handle and try running again. It should not succeed. - status = flr->ReleaseHandle(handle); - if (!status.ok()) return status; - - Status status2 = Run(flr, handle, opts, args, std::move(rets)); - EXPECT_TRUE(errors::IsInvalidArgument(status2)); - EXPECT_TRUE( - str_util::StrContains(status2.error_message(), "remote execution.")); - - return status; + return Run(flr, handle, opts, args, std::move(rets), add_runner); } Status Run(FunctionLibraryRuntime* flr, FunctionLibraryRuntime::Handle handle, diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc index d05f146f21a..e61ed8c4794 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime.cc +++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc @@ -181,12 +181,7 @@ FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::AddHandle( const string& function_key, const string& device_name, FunctionLibraryRuntime::LocalHandle local_handle) { mutex_lock l(mu_); - FunctionLibraryRuntime::Handle h = - gtl::FindWithDefault(table_, function_key, kInvalidHandle); - if (h != kInvalidHandle) { - if (function_data_.count(h) != 0) return h; - } - h = next_handle_; + auto h = next_handle_; FunctionData* fd = new FunctionData(device_name, local_handle); function_data_[h] = std::unique_ptr(fd); table_[function_key] = h; @@ -197,12 +192,7 @@ FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::AddHandle( FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::GetHandle( const string& function_key) const { mutex_lock l(mu_); - FunctionLibraryRuntime::Handle h = - gtl::FindWithDefault(table_, function_key, kInvalidHandle); - if (h != kInvalidHandle) { - if (function_data_.count(h) == 0) return kInvalidHandle; - } - return h; + return gtl::FindWithDefault(table_, function_key, kInvalidHandle); } bool 
ProcessFunctionLibraryRuntime::IsInstantiatedOnDevice( @@ -272,13 +262,6 @@ Status ProcessFunctionLibraryRuntime::Instantiate( return Status::OK(); } -Status ProcessFunctionLibraryRuntime::RemoveHandle( - FunctionLibraryRuntime::Handle handle) { - mutex_lock l(mu_); - function_data_.erase(handle); - return Status::OK(); -} - Status ProcessFunctionLibraryRuntime::ReleaseHandle( FunctionLibraryRuntime::Handle handle) { FunctionLibraryRuntime* flr = nullptr; diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h index c7b8259f787..05e57708993 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime.h +++ b/tensorflow/core/common_runtime/process_function_library_runtime.h @@ -134,9 +134,6 @@ class ProcessFunctionLibraryRuntime { // of the device where the function is registered. string GetDeviceName(FunctionLibraryRuntime::Handle handle); - // Removes handle from the state owned by this object. - Status RemoveHandle(FunctionLibraryRuntime::Handle handle); - Status Clone(Env* env, int graph_def_version, const OptimizerOptions& optimizer_options, CustomKernelCreator custom_kernel_creator, diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc index 4fbf2abc671..cc10e77ad2e 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc +++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc @@ -119,12 +119,13 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test { EXPECT_GE(call_count, 1); // Test runner is used. - // Release the handle and then try running the function. It shouldn't - // succeed. + // Release the handle and then try running the function. It + // should still succeed. status = proc_flr_->ReleaseHandle(handle); if (!status.ok()) { return status; } + Notification done2; proc_flr_->Run(opts, handle, args, &out, [&status, &done2](const Status& s) { @@ -132,10 +133,7 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test { done2.Notify(); }); done2.WaitForNotification(); - EXPECT_TRUE(errors::IsNotFound(status)); - EXPECT_TRUE(str_util::StrContains(status.error_message(), "not found.")); - - return Status::OK(); + return status; } std::vector devices_; From d481f07549470b4a03b41f9bb588d7f7ddc85082 Mon Sep 17 00:00:00 2001 From: Yifei Feng Date: Sun, 22 Apr 2018 09:26:15 -0700 Subject: [PATCH 0582/1734] Remove proto header include in core/kernels. 
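In practice, interface headers forward-declare the proto message types they
only pass by pointer or reference, and the generated .pb.h headers are
included solely from the .cc files that inspect message contents. Sketched
with names from the diffs below:

    // In a header such as i_remote_fused_graph_executor.h:
    namespace tensorflow {
    class GraphDef;                     // forward declaration
    class RemoteFusedGraphExecuteInfo;  // forward declaration
    }  // namespace tensorflow

    // Only in implementation and test .cc files:
    #include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"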
The goal is to make kernels mostly independent of proto headers, which will let us lock down our .so import PiperOrigin-RevId: 193843351 --- .../remote_fused_graph_execute_info.proto | 8 ---- tensorflow/core/kernels/BUILD | 1 + .../hexagon/hexagon_control_wrapper.cc | 1 + .../hexagon/hexagon_graph_execution_test.cc | 1 + .../kernels/i_remote_fused_graph_executor.h | 4 +- .../remote_fused_graph_execute_utils.cc | 46 +++++++++---------- .../remote_fused_graph_execute_utils.h | 28 +++++++---- .../remote_fused_graph_execute_utils_test.cc | 1 + ...ote_fused_graph_rewriter_transform_test.cc | 1 + tensorflow/core/kernels/summary_interface.h | 5 +- tensorflow/core/kernels/summary_kernels.cc | 1 + 11 files changed, 52 insertions(+), 45 deletions(-) diff --git a/tensorflow/core/framework/remote_fused_graph_execute_info.proto b/tensorflow/core/framework/remote_fused_graph_execute_info.proto index 389a08ac2f3..946da40d0e3 100644 --- a/tensorflow/core/framework/remote_fused_graph_execute_info.proto +++ b/tensorflow/core/framework/remote_fused_graph_execute_info.proto @@ -14,14 +14,6 @@ import "tensorflow/core/framework/types.proto"; // not valid across executions, but can be serialized back and forth from within // a single run. message RemoteFusedGraphExecuteInfo { - enum NodeType { - UNUSED = 0; - GRAPH_INPUT = 1; - GRAPH_OUTPUT = 2; - FUSED_NODE = 3; - BORDER_INPUT = 4; - BORDER_OUTPUT = 5; - } message TensorShapeTypeProto { DataType dtype = 1; diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 7ef15da143b..f7f6a9b505a 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -5925,6 +5925,7 @@ tf_cc_test( "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", "//tensorflow/core:tensorflow", "//tensorflow/core:test", "//tensorflow/core:test_main", diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc index 66d24d171d1..3810cbe5b55 100644 --- a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc +++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h" #include "tensorflow/core/framework/graph_transfer_info.pb.h" +#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h" #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h" #include "tensorflow/core/kernels/hexagon/soc_interface.h" diff --git a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc index 5fb6b9247f0..d53977703e4 100644 --- a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc +++ b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc @@ -30,6 +30,7 @@ adb push /tmp/imagenet_comp_graph_label_strings.txt /data/local/tmp #include #include "tensorflow/core/framework/graph_transfer_info.pb.h" +#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h" #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/framework/tensor_testutil.h" #include "tensorflow/core/kernels/hexagon/graph_transfer_utils.h" diff --git a/tensorflow/core/kernels/i_remote_fused_graph_executor.h b/tensorflow/core/kernels/i_remote_fused_graph_executor.h index eb6b64da583..60724126892 100644 --- a/tensorflow/core/kernels/i_remote_fused_graph_executor.h +++ b/tensorflow/core/kernels/i_remote_fused_graph_executor.h @@ -16,13 +16,15 @@ limitations under the License. #ifndef TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_ #define TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_ -#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/platform/macros.h" namespace tensorflow { +class GraphDef; +class RemoteFusedGraphExecuteInfo; + class IRemoteFusedGraphExecutor { public: using TensorAllocatorFunc = std::function; diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc b/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc index e2709c117dc..cc4d9a49a00 100644 --- a/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc +++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc @@ -20,7 +20,9 @@ limitations under the License. 
#include #include "tensorflow/core/common_runtime/shape_refiner.h" +#include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/graph/algorithm.h" @@ -1125,46 +1127,43 @@ RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteOpNode( for (size_t i = 0; i < inputs.size(); ++i) { if (IsSameNodeName(node_def, inputs.at(i), &tid)) { AppendDeliminator(&attr_str); - attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::GRAPH_INPUT, - tid.second, i, remote_graph_executor_name, + attr_str += BuildNodeTypeAttr(GRAPH_INPUT, tid.second, i, + remote_graph_executor_name, remote_fused_graph_node_name); } } for (size_t i = 0; i < outputs.size(); ++i) { if (IsSameNodeName(node_def, outputs.at(i), &tid)) { AppendDeliminator(&attr_str); - attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::GRAPH_OUTPUT, - tid.second, i); + attr_str += BuildNodeTypeAttr(GRAPH_OUTPUT, tid.second, i); } } for (const string& fused_node_name : fused_node_names) { if (fused_node_name == node_def.name()) { AppendDeliminator(&attr_str); - attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::FUSED_NODE); + attr_str += BuildNodeTypeAttr(FUSED_NODE); } } for (const string& fused_node_name : fused_nodes_filtered_by_op_types) { if (fused_node_name == node_def.name()) { AppendDeliminator(&attr_str); - attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::FUSED_NODE); + attr_str += BuildNodeTypeAttr(FUSED_NODE); } } for (size_t i = 0; i < border_inputs.size(); ++i) { if (IsSameNodeName(node_def, border_inputs.at(i), &tid)) { AppendDeliminator(&attr_str); - attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::BORDER_INPUT, - tid.second, i); + attr_str += BuildNodeTypeAttr(BORDER_INPUT, tid.second, i); } } for (size_t i = 0; i < border_outputs.size(); ++i) { if (IsSameNodeName(node_def, border_outputs.at(i), &tid)) { AppendDeliminator(&attr_str); - attr_str += BuildNodeTypeAttr( - RemoteFusedGraphExecuteInfo::BORDER_OUTPUT, tid.second, i); + attr_str += BuildNodeTypeAttr(BORDER_OUTPUT, tid.second, i); } } if (attr_str.empty()) { - attr_str += BuildNodeTypeAttr(RemoteFusedGraphExecuteInfo::UNUSED); + attr_str += BuildNodeTypeAttr(UNUSED); } AddNodeAttr(ATTR_NODE_TYPE, attr_str, &node_def); } @@ -1200,14 +1199,14 @@ RemoteFusedGraphExecuteUtils::FuseRemoteGraphByPlacedArguments( } int node_type_int; CHECK(strings::safe_strto32(attr.at(0), &node_type_int)) << attr.at(0); - const RemoteFusedGraphExecuteInfo::NodeType node_type = - static_cast(node_type_int); + const RemoteFusedGraphNodeType node_type = + static_cast(node_type_int); const string& name = node_def.name(); int port; int index; switch (node_type) { - case RemoteFusedGraphExecuteInfo::GRAPH_INPUT: + case GRAPH_INPUT: VLOG(2) << "Graph input: " << name; CHECK_EQ(5, attr.size()); CHECK(strings::safe_strto32(attr.at(1), &port)); @@ -1224,33 +1223,33 @@ RemoteFusedGraphExecuteUtils::FuseRemoteGraphByPlacedArguments( return Status::OK(); } break; - case RemoteFusedGraphExecuteInfo::GRAPH_OUTPUT: + case GRAPH_OUTPUT: VLOG(2) << "Graph output: " << name; CHECK_EQ(3, attr.size()); CHECK(strings::safe_strto32(attr.at(1), &port)); CHECK(strings::safe_strto32(attr.at(2), &index)); output_map.emplace(index, strings::StrCat(name, ":", port)); break; - case RemoteFusedGraphExecuteInfo::FUSED_NODE: + case FUSED_NODE: VLOG(2) 
<< "Fused node: " << name; CHECK_EQ(1, attr.size()); fused_node_names.emplace(name); break; - case RemoteFusedGraphExecuteInfo::BORDER_INPUT: + case BORDER_INPUT: VLOG(2) << "Border input: " << name; CHECK_EQ(3, attr.size()); CHECK(strings::safe_strto32(attr.at(1), &port)); CHECK(strings::safe_strto32(attr.at(2), &index)); border_input_map.emplace(index, strings::StrCat(name, ":", port)); break; - case RemoteFusedGraphExecuteInfo::BORDER_OUTPUT: + case BORDER_OUTPUT: VLOG(2) << "Border output: " << name; CHECK_EQ(3, attr.size()); CHECK(strings::safe_strto32(attr.at(1), &port)); CHECK(strings::safe_strto32(attr.at(2), &index)); border_output_map.emplace(index, strings::StrCat(name, ":", port)); break; - case RemoteFusedGraphExecuteInfo::UNUSED: + case UNUSED: // do nothing break; default: @@ -1461,20 +1460,19 @@ RemoteFusedGraphExecuteUtils::BuildNodeMapFromOpsDefinitions( } /* static */ string RemoteFusedGraphExecuteUtils::BuildNodeTypeAttr( - const RemoteFusedGraphExecuteInfo::NodeType node_type, const int port, - const int index, const string& executor_name, const string& node_name) { + const RemoteFusedGraphNodeType node_type, const int port, const int index, + const string& executor_name, const string& node_name) { return strings::StrCat(static_cast(node_type), ",", port, ",", index, ",", executor_name, ",", node_name); } /* static */ string RemoteFusedGraphExecuteUtils::BuildNodeTypeAttr( - const RemoteFusedGraphExecuteInfo::NodeType node_type, const int port, - const int index) { + const RemoteFusedGraphNodeType node_type, const int port, const int index) { return strings::StrCat(static_cast(node_type), ",", port, ",", index); } /* static */ string RemoteFusedGraphExecuteUtils::BuildNodeTypeAttr( - const RemoteFusedGraphExecuteInfo::NodeType node_type) { + const RemoteFusedGraphNodeType node_type) { return strings::StrCat(static_cast(node_type)); } diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils.h b/tensorflow/core/kernels/remote_fused_graph_execute_utils.h index f0471442781..ea6b6a10154 100644 --- a/tensorflow/core/kernels/remote_fused_graph_execute_utils.h +++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils.h @@ -19,8 +19,6 @@ limitations under the License. #include #include -#include "tensorflow/core/framework/graph.pb.h" -#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/kernels/i_remote_fused_graph_executor.h" @@ -30,6 +28,17 @@ limitations under the License. namespace tensorflow { +enum RemoteFusedGraphNodeType { + UNUSED = 0, + GRAPH_INPUT = 1, + GRAPH_OUTPUT = 2, + FUSED_NODE = 3, + BORDER_INPUT = 4, + BORDER_OUTPUT = 5, +}; + +class RemoteFusedGraphExecuteInfo; + // RemoteFusedGraphExecuteUtils provides APIs to register and get builder // functions for IRemoteFusedGraphExecutor. 
class RemoteFusedGraphExecuteUtils { @@ -297,16 +306,15 @@ class RemoteFusedGraphExecuteUtils { static ExecutorBuildRegistry* GetExecutorBuildRegistry(); - static string BuildNodeTypeAttr( - const RemoteFusedGraphExecuteInfo::NodeType node_type, const int port, - const int index, const string& executor_name, const string& node_name); + static string BuildNodeTypeAttr(const RemoteFusedGraphNodeType node_type, + const int port, const int index, + const string& executor_name, + const string& node_name); - static string BuildNodeTypeAttr( - const RemoteFusedGraphExecuteInfo::NodeType node_type, const int port, - const int index); + static string BuildNodeTypeAttr(const RemoteFusedGraphNodeType node_type, + const int port, const int index); - static string BuildNodeTypeAttr( - const RemoteFusedGraphExecuteInfo::NodeType node_type); + static string BuildNodeTypeAttr(const RemoteFusedGraphNodeType node_type); TF_DISALLOW_COPY_AND_ASSIGN(RemoteFusedGraphExecuteUtils); }; diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc b/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc index aca8ddfae9a..44251e6ff8e 100644 --- a/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc +++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc @@ -17,6 +17,7 @@ limitations under the License. #include "tensorflow/cc/framework/scope.h" #include "tensorflow/core/common_runtime/shape_refiner.h" #include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h" #include "tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status_test_util.h" diff --git a/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc b/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc index 9217c25978c..1e0731e540c 100644 --- a/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc +++ b/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/cc/ops/nn_ops.h" #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_testutil.h" #include "tensorflow/core/graph/default_device.h" diff --git a/tensorflow/core/kernels/summary_interface.h b/tensorflow/core/kernels/summary_interface.h index 02391e967a8..1854fe55268 100644 --- a/tensorflow/core/kernels/summary_interface.h +++ b/tensorflow/core/kernels/summary_interface.h @@ -17,14 +17,15 @@ limitations under the License. #include -#include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/event.pb.h" namespace tensorflow { +class Event; +class GraphDef; + // Main interface for the summary writer resource. class SummaryWriterInterface : public ResourceBase { public: diff --git a/tensorflow/core/kernels/summary_kernels.cc b/tensorflow/core/kernels/summary_kernels.cc index d317a8d33db..b287f0cc2f1 100644 --- a/tensorflow/core/kernels/summary_kernels.cc +++ b/tensorflow/core/kernels/summary_kernels.cc @@ -21,6 +21,7 @@ limitations under the License. 
#include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/lib/db/sqlite.h" #include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/util/event.pb.h" namespace tensorflow { From 21bd19a8b8b0be8ac4d39b6bc32366ba908f5105 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 22 Apr 2018 17:49:13 +0000 Subject: [PATCH 0583/1734] Change from squeeze_dims to axis when calling tf.squeeze The `squeeze_dims` in `tf.squeeze` has been deprecated in favor of `axis` while many places still use `squeeze_dims`. That generates lots of warnings. This fix switches from `squeeze_dims` to `axis` to remove those warnings. Signed-off-by: Yong Tang --- tensorflow/python/ops/array_grad.py | 2 +- tensorflow/python/ops/array_ops.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py index 57d26578387..3678bd4c1f6 100644 --- a/tensorflow/python/ops/array_grad.py +++ b/tensorflow/python/ops/array_grad.py @@ -196,7 +196,7 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index): array_ops.where( math_ops.logical_and(grad.indices >= start, grad.indices < end)), - squeeze_dims=[1]) + axis=[1]) new_indices = array_ops.gather(grad.indices, indices_to_select) - start new_values = array_ops.gather(grad.values, indices_to_select) out_grads.append(ops.IndexedSlices(new_values, new_indices, size)) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 23202ae28e1..bbffff04831 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -1230,7 +1230,7 @@ def boolean_mask(tensor, mask, name="boolean_mask", axis=None): def _apply_mask_1d(reshaped_tensor, mask, axis=None): """Mask tensor along dimension 0 with a 1-D mask.""" - indices = squeeze(where(mask), squeeze_dims=[1]) + indices = squeeze(where(mask), axis=[1]) return gather(reshaped_tensor, indices, axis=axis) with ops.name_scope(name, values=[tensor, mask]): From 100b6000d4d04a344a1516578f724e46cdede5e1 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 22 Apr 2018 17:52:31 +0000 Subject: [PATCH 0584/1734] Fix warning in image related ops. 
Signed-off-by: Yong Tang --- tensorflow/python/ops/image_ops_impl.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index 601010bce9e..bd5b2ae83b5 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -652,7 +652,7 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height, padded.set_shape(padded_shape) if not is_batch: - padded = array_ops.squeeze(padded, squeeze_dims=[0]) + padded = array_ops.squeeze(padded, axis=[0]) return padded @@ -732,7 +732,7 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height, cropped.set_shape(cropped_shape) if not is_batch: - cropped = array_ops.squeeze(cropped, squeeze_dims=[0]) + cropped = array_ops.squeeze(cropped, axis=[0]) return cropped @@ -849,7 +849,7 @@ def resize_image_with_crop_or_pad(image, target_height, target_width): resized = control_flow_ops.with_dependencies(assert_ops, resized) if not is_batch: - resized = array_ops.squeeze(resized, squeeze_dims=[0]) + resized = array_ops.squeeze(resized, axis=[0]) return resized @@ -942,7 +942,7 @@ def resize_images(images, for x in [new_width_const, width, new_height_const, height]) and ( width == new_width_const and height == new_height_const): if not is_batch: - images = array_ops.squeeze(images, squeeze_dims=[0]) + images = array_ops.squeeze(images, axis=[0]) return images if method == ResizeMethod.BILINEAR: @@ -965,7 +965,7 @@ def resize_images(images, images.set_shape([None, new_height_const, new_width_const, None]) if not is_batch: - images = array_ops.squeeze(images, squeeze_dims=[0]) + images = array_ops.squeeze(images, axis=[0]) return images From 8cdc752227af998da946decc9365d63bcaa7f184 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 22 Apr 2018 17:53:10 +0000 Subject: [PATCH 0585/1734] Fix warning in tf.nn ops where squeeze_dims was used with tf.squeeze Signed-off-by: Yong Tang --- tensorflow/python/ops/nn_impl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py index d0d5ed07ced..576627e78ed 100644 --- a/tensorflow/python/ops/nn_impl.py +++ b/tensorflow/python/ops/nn_impl.py @@ -765,9 +765,9 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False): weighted_variance = math_ops.multiply(weighted_distsq, divisor) if not keep_dims: - weighted_mean = array_ops.squeeze(weighted_mean, squeeze_dims=axes) + weighted_mean = array_ops.squeeze(weighted_mean, axis=axes) weighted_variance = array_ops.squeeze( - weighted_variance, squeeze_dims=axes) + weighted_variance, axis=axes) if needs_cast: weighted_mean = math_ops.cast(weighted_mean, dtypes.float16) From 12fd64f72f59ff5ba114903d4b851f855aaf2458 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 22 Apr 2018 17:53:58 +0000 Subject: [PATCH 0586/1734] Fix warnings in reduce_join_op_test.py Signed-off-by: Yong Tang --- tensorflow/python/kernel_tests/reduce_join_op_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/kernel_tests/reduce_join_op_test.py b/tensorflow/python/kernel_tests/reduce_join_op_test.py index 7f3049b9f84..fb9e5cc2a37 100644 --- a/tensorflow/python/kernel_tests/reduce_join_op_test.py +++ b/tensorflow/python/kernel_tests/reduce_join_op_test.py @@ -160,7 +160,7 @@ class ReduceJoinTest(UnicodeTestCase): separator=separator) if not reduction_indices: truth = constant_op.constant(truth) - 
truth_squeezed = array_ops.squeeze(truth, squeeze_dims=reduction_indices) + truth_squeezed = array_ops.squeeze(truth, axis=reduction_indices) output_array = output.eval() output_keep_dims_array = output_keep_dims.eval() truth_array = truth.eval() From 9aa142284166c51dfc202b551b4592f9c9ed54e7 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 22 Apr 2018 17:54:26 +0000 Subject: [PATCH 0587/1734] Fix tf.contrib.timeseries warnings related to squeeze_dims Signed-off-by: Yong Tang --- .../timeseries/python/timeseries/state_management_test.py | 2 +- .../python/timeseries/state_space_models/kalman_filter.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py b/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py index d5dce30fda0..5f7e3da2db6 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py +++ b/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py @@ -78,7 +78,7 @@ class StubTimeSeriesModel(model.TimeSeriesModel): batch_end_values = array_ops.squeeze( array_ops.slice(values, [0, array_ops.shape(times)[1] - 1, 0], [-1, 1, -1]), - squeeze_dims=[1, 2]) + axis=[1, 2]) # A pretty odd but easy to think about loss: L1 loss on the batch end # values. loss = math_ops.reduce_sum( diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py index 1fcd3e391b6..a614386121e 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py +++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py @@ -170,7 +170,7 @@ class KalmanFilter(object): math_ops.matmul( transition_matrices, prior_state[..., None]), - squeeze_dims=[-1]) + axis=[-1]) return advanced_state def predict_state_var( @@ -254,7 +254,7 @@ class KalmanFilter(object): kalman_gain_transposed, array_ops.expand_dims(residual, -1), adjoint_a=True), - squeeze_dims=[-1]) + axis=[-1]) gain_obs = math_ops.matmul( kalman_gain_transposed, observation_model, adjoint_a=True) identity_extradim = linalg_ops.eye( @@ -332,7 +332,7 @@ class KalmanFilter(object): array_ops.expand_dims(state_mean, 1), observation_model, adjoint_b=True), - squeeze_dims=[1]) + axis=[1]) observed_var = math_ops.matmul( math_ops.matmul(observation_model, state_var), observation_model, From 8257b9096062a87555d72f7c15e16b1d8e748d70 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 22 Apr 2018 17:55:06 +0000 Subject: [PATCH 0588/1734] Fix warnings in tf.contrib.tensor_forest Signed-off-by: Yong Tang --- tensorflow/contrib/tensor_forest/client/eval_metrics.py | 4 ++-- .../tensor_forest/hybrid/python/layers/fully_connected.py | 2 +- tensorflow/contrib/tensor_forest/python/tensor_forest.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/contrib/tensor_forest/client/eval_metrics.py b/tensorflow/contrib/tensor_forest/client/eval_metrics.py index 90033015ebc..e893e1d1c83 100644 --- a/tensorflow/contrib/tensor_forest/client/eval_metrics.py +++ b/tensorflow/contrib/tensor_forest/client/eval_metrics.py @@ -37,7 +37,7 @@ def _top_k_generator(k): def _top_k(probabilities, targets): targets = math_ops.to_int32(targets) if targets.get_shape().ndims > 1: - targets = array_ops.squeeze(targets, squeeze_dims=[1]) + targets = array_ops.squeeze(targets, axis=[1]) return metric_ops.streaming_mean(nn.in_top_k(probabilities, 
targets, k)) return _top_k @@ -57,7 +57,7 @@ def _r2(probabilities, targets, weights=None): def _squeeze_and_onehot(targets, depth): - targets = array_ops.squeeze(targets, squeeze_dims=[1]) + targets = array_ops.squeeze(targets, axis=[1]) return array_ops.one_hot(math_ops.to_int32(targets), depth) diff --git a/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py b/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py index ff3ab21eaa9..745a5b1caf2 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py +++ b/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py @@ -55,7 +55,7 @@ class ManyToOneLayer(hybrid_layer.HybridLayer): # There is always one activation per instance by definition, so squeeze # away the extra dimension. - return array_ops.squeeze(nn_activations, squeeze_dims=[1]) + return array_ops.squeeze(nn_activations, axis=[1]) class FlattenedFullyConnectedLayer(hybrid_layer.HybridLayer): diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py index b9bcbb170b0..7a35a70bbe3 100644 --- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py +++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py @@ -445,7 +445,7 @@ class RandomForestGraphs(object): mask = math_ops.less( r, array_ops.ones_like(r) * self.params.bagging_fraction) gather_indices = array_ops.squeeze( - array_ops.where(mask), squeeze_dims=[1]) + array_ops.where(mask), axis=[1]) # TODO(thomaswc): Calculate out-of-bag data and labels, and store # them for use in calculating statistics later. tree_data = array_ops.gather(processed_dense_features, gather_indices) From 685fec394235b409b58d7ef1c4a26655f9fedcfd Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 22 Apr 2018 17:55:35 +0000 Subject: [PATCH 0589/1734] Fix squeeze_dims warnings in tf.contrib.learn Signed-off-by: Yong Tang --- tensorflow/contrib/learn/python/learn/estimators/head.py | 4 ++-- tensorflow/contrib/learn/python/learn/ops/losses_ops.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py index 2b4b6eff39f..e28e6854a50 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/head.py +++ b/tensorflow/contrib/learn/python/learn/estimators/head.py @@ -777,7 +777,7 @@ class _RegressionHead(_SingleHead): key = prediction_key.PredictionKey.SCORES with ops.name_scope(None, "predictions", (logits,)): if self.logits_dimension == 1: - logits = array_ops.squeeze(logits, squeeze_dims=(1,), name=key) + logits = array_ops.squeeze(logits, axis=(1,), name=key) return {key: self._link_fn(logits)} def _metrics(self, eval_loss, predictions, labels, weights): @@ -974,7 +974,7 @@ def _softmax_cross_entropy_loss(labels, logits, weights=None): is_squeezed_labels = False # TODO(ptucker): This will break for dynamic shapes. 
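    # Illustrative shape note (a [batch_size, 1] labels tensor is assumed):
    # sparse_softmax_cross_entropy_with_logits expects rank-1 integer class
    # ids of shape [batch_size], so the singleton second dimension is
    # squeezed away before the loss is computed.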
if len(labels.get_shape()) == 2: - labels = array_ops.squeeze(labels, squeeze_dims=(1,)) + labels = array_ops.squeeze(labels, axis=(1,)) is_squeezed_labels = True loss = nn.sparse_softmax_cross_entropy_with_logits( diff --git a/tensorflow/contrib/learn/python/learn/ops/losses_ops.py b/tensorflow/contrib/learn/python/learn/ops/losses_ops.py index 92976d1539c..9f2cadb0174 100644 --- a/tensorflow/contrib/learn/python/learn/ops/losses_ops.py +++ b/tensorflow/contrib/learn/python/learn/ops/losses_ops.py @@ -40,7 +40,7 @@ def mean_squared_error_regressor(tensor_in, labels, weights, biases, name=None): [tensor_in, labels]): predictions = nn.xw_plus_b(tensor_in, weights, biases) if len(labels.get_shape()) == 1 and len(predictions.get_shape()) == 2: - predictions = array_ops_.squeeze(predictions, squeeze_dims=[1]) + predictions = array_ops_.squeeze(predictions, axis=[1]) return predictions, losses.mean_squared_error(labels, predictions) From 5c19fc7810f13712127b8527b040f8f656474fe5 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 22 Apr 2018 17:56:09 +0000 Subject: [PATCH 0590/1734] Fix tf.contrib.layers warnings where squeeze_dims were used with tf.squeeze Signed-off-by: Yong Tang --- tensorflow/contrib/layers/python/layers/target_column.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/layers/python/layers/target_column.py b/tensorflow/contrib/layers/python/layers/target_column.py index 3e639a180ef..69bb6be8145 100644 --- a/tensorflow/contrib/layers/python/layers/target_column.py +++ b/tensorflow/contrib/layers/python/layers/target_column.py @@ -270,7 +270,7 @@ class _RegressionTargetColumn(_TargetColumn): def logits_to_predictions(self, logits, proba=False): if self.num_label_columns == 1: - return array_ops.squeeze(logits, squeeze_dims=[1]) + return array_ops.squeeze(logits, axis=[1]) return logits def get_eval_ops(self, features, logits, labels, metrics=None): @@ -418,7 +418,7 @@ def _softmax_cross_entropy_loss(logits, target): "Instead got %s." % target.dtype) # sparse_softmax_cross_entropy_with_logits requires [batch_size] target. if len(target.get_shape()) == 2: - target = array_ops.squeeze(target, squeeze_dims=[1]) + target = array_ops.squeeze(target, axis=[1]) loss_vec = nn.sparse_softmax_cross_entropy_with_logits( labels=target, logits=logits) return loss_vec From 50a8df144d24ce60866bff96645f04e84a31f8b4 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 22 Apr 2018 17:57:06 +0000 Subject: [PATCH 0591/1734] Fix warnings in tf.contrib.factorization Signed-off-by: Yong Tang --- tensorflow/contrib/factorization/python/ops/gmm_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py index ccdd679d6ae..e076631bc16 100644 --- a/tensorflow/contrib/factorization/python/ops/gmm_ops.py +++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py @@ -397,7 +397,7 @@ class GmmAlgorithm(object): # Compute the effective number of data points assigned to component k. with ops.control_dependencies(self._w): points_in_k = array_ops.squeeze( - math_ops.add_n(self._points_in_k), squeeze_dims=[0]) + math_ops.add_n(self._points_in_k), axis=[0]) # Update alpha. 
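    # Sketch of the update this implements (assuming standard EM for
    # Gaussian mixtures): the mixture weight of component k is re-estimated
    # from its effective point count, roughly
    #   alpha_k = points_in_k[k] / sum_j points_in_k[j]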
if 'w' in self._params: final_points_in_k = points_in_k / num_batches From 82eacbd4ac29db754b86a0be0cdfcc65b467c6af Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 22 Apr 2018 17:57:31 +0000 Subject: [PATCH 0592/1734] Fix warnings in tf.contrib.distributions with squeeze_dims Signed-off-by: Yong Tang --- .../python/ops/bijectors/cholesky_outer_product.py | 2 +- tensorflow/contrib/distributions/python/ops/shape.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py index caae2adcfac..ecdb8967f43 100644 --- a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py +++ b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py @@ -170,7 +170,7 @@ class CholeskyOuterProduct(bijector.Bijector): sum_weighted_log_diag = array_ops.squeeze( math_ops.matmul(math_ops.log(diag), exponents[..., array_ops.newaxis]), - squeeze_dims=-1) + axis=-1) fldj = p_float * np.log(2.) + sum_weighted_log_diag return fldj diff --git a/tensorflow/contrib/distributions/python/ops/shape.py b/tensorflow/contrib/distributions/python/ops/shape.py index bac0b79d590..6a7f28713ac 100644 --- a/tensorflow/contrib/distributions/python/ops/shape.py +++ b/tensorflow/contrib/distributions/python/ops/shape.py @@ -439,7 +439,7 @@ class _DistributionShape(object): if self._batch_ndims_is_0 and expand_batch_dim: squeeze_dims += [1] if squeeze_dims: - x = array_ops.squeeze(x, squeeze_dims=squeeze_dims) + x = array_ops.squeeze(x, axis=squeeze_dims) # x.shape: [prod(S)]+B+E _, batch_shape, event_shape = self.get_shape(x) else: From ea0c8a7ed84eb5cdf8ca6a856f9bd05a95597739 Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Sun, 22 Apr 2018 12:18:05 -0700 Subject: [PATCH 0593/1734] [StreamExecutor] [XLA] Delete copy/pasted implementations of MakeUnique. StreamExecutor and XLA had a copy/pasted implementation of MakeUnique, in namespaces stream_executor::port and xla. This change removes those implementations and instead pulls tensorflow::MakeUnique into namespace stream_executor and namespace xla. We pull it into stream_executor rather than stream_executor::port for consistency with TF and XLA, which both pull MakeUnique into their own namespace. This change also moves MakeUnique and WrapUnique out of namespace tensorflow::scam_ops::internal -- scam can simply use tensorflow::{Make,Wrap}Unique. I suspect the reason it was this way originally was that TF didn't have Make/WrapUnique. PiperOrigin-RevId: 193849330 --- tensorflow/compiler/xla/ptr_util.h | 22 +--------- .../xla/service/interpreter/platform.cc | 4 +- tensorflow/stream_executor/BUILD | 2 + .../stream_executor/cuda/cuda_platform.cc | 4 +- .../stream_executor/host/host_platform.cc | 4 +- tensorflow/stream_executor/lib/ptr_util.h | 42 ++----------------- 6 files changed, 13 insertions(+), 65 deletions(-) diff --git a/tensorflow/compiler/xla/ptr_util.h b/tensorflow/compiler/xla/ptr_util.h index c58c19db2ca..bfcdfc62f95 100644 --- a/tensorflow/compiler/xla/ptr_util.h +++ b/tensorflow/compiler/xla/ptr_util.h @@ -28,26 +28,8 @@ limitations under the License. #include "tensorflow/core/util/ptr_util.h" namespace xla { - -template -std::unique_ptr WrapUnique(T* ptr) { - return tensorflow::WrapUnique(ptr); -} - -template -typename tensorflow::helper::MakeUniqueResult::scalar MakeUnique( - Args&&... 
args) { - return tensorflow::MakeUnique(std::forward(args)...); -} - -// Overload for array of unknown bound. -// The allocation of arrays needs to use the array form of new, -// and cannot take element constructor arguments. -template -typename tensorflow::helper::MakeUniqueResult::array MakeUnique(size_t n) { - return tensorflow::MakeUnique(n); -} - +using tensorflow::MakeUnique; +using tensorflow::WrapUnique; } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_PTR_UTIL_H_ diff --git a/tensorflow/compiler/xla/service/interpreter/platform.cc b/tensorflow/compiler/xla/service/interpreter/platform.cc index ce2f4d378c0..92e069a8c67 100644 --- a/tensorflow/compiler/xla/service/interpreter/platform.cc +++ b/tensorflow/compiler/xla/service/interpreter/platform.cc @@ -71,8 +71,8 @@ port::StatusOr XlaInterpreterPlatform::GetExecutor( port::StatusOr> XlaInterpreterPlatform::GetUncachedExecutor( const StreamExecutorConfig& config) { - auto executor = port::MakeUnique( - this, port::MakeUnique(config.plugin_config)); + auto executor = MakeUnique( + this, MakeUnique(config.plugin_config)); auto init_status = executor->Init(config.ordinal, config.device_options); if (!init_status.ok()) { return port::Status{ diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD index 80fc9ff2926..c68cda01002 100644 --- a/tensorflow/stream_executor/BUILD +++ b/tensorflow/stream_executor/BUILD @@ -35,6 +35,7 @@ cc_library( deps = [ "//tensorflow/compiler/xla:statusor", "//tensorflow/core:lib", + "//tensorflow/core:ptr_util", "@local_config_cuda//cuda:cuda_headers", ], alwayslink = 1, @@ -46,6 +47,7 @@ cc_library( visibility = ["//visibility:public"], deps = [ "//tensorflow/core:lib", + "//tensorflow/core:ptr_util", "//tensorflow/compiler/xla:statusor", "@local_config_cuda//cuda:cuda_headers", ] + if_static([":stream_executor_impl"]), diff --git a/tensorflow/stream_executor/cuda/cuda_platform.cc b/tensorflow/stream_executor/cuda/cuda_platform.cc index 7a6ef5a248f..649224a20e9 100644 --- a/tensorflow/stream_executor/cuda/cuda_platform.cc +++ b/tensorflow/stream_executor/cuda/cuda_platform.cc @@ -168,8 +168,8 @@ port::StatusOr CudaPlatform::GetExecutor( port::StatusOr> CudaPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) { - auto executor = port::MakeUnique( - this, port::MakeUnique(config.plugin_config)); + auto executor = MakeUnique( + this, MakeUnique(config.plugin_config)); auto init_status = executor->Init(config.ordinal, config.device_options); if (!init_status.ok()) { return port::Status{ diff --git a/tensorflow/stream_executor/host/host_platform.cc b/tensorflow/stream_executor/host/host_platform.cc index 00a17a05ede..a652b08b4fc 100644 --- a/tensorflow/stream_executor/host/host_platform.cc +++ b/tensorflow/stream_executor/host/host_platform.cc @@ -66,8 +66,8 @@ port::StatusOr HostPlatform::GetExecutor( port::StatusOr> HostPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) { - auto executor = port::MakeUnique( - this, port::MakeUnique(config.plugin_config)); + auto executor = MakeUnique( + this, MakeUnique(config.plugin_config)); auto init_status = executor->Init(config.ordinal, config.device_options); if (!init_status.ok()) { return port::Status{ diff --git a/tensorflow/stream_executor/lib/ptr_util.h b/tensorflow/stream_executor/lib/ptr_util.h index 3f89794688c..8f9f420fec7 100644 --- a/tensorflow/stream_executor/lib/ptr_util.h +++ b/tensorflow/stream_executor/lib/ptr_util.h @@ -17,47 +17,11 @@ limitations under the License. 
#define TENSORFLOW_STREAM_EXECUTOR_LIB_PTR_UTIL_H_

 #include <memory>
+#include "tensorflow/core/util/ptr_util.h"

 namespace stream_executor {
-namespace port {
-
-// Trait to select overloads and return types for MakeUnique.
-template <typename T>
-struct MakeUniqueResult {
-  using scalar = std::unique_ptr<T>;
-};
-template <typename T>
-struct MakeUniqueResult<T[]> {
-  using array = std::unique_ptr<T[]>;
-};
-template <typename T, size_t N>
-struct MakeUniqueResult<T[N]> {
-  using invalid = void;
-};
-
-// MakeUnique<T>(...) is an early implementation of C++14 std::make_unique.
-// It is designed to be 100% compatible with std::make_unique so that the
-// eventual switchover will be a simple renaming operation.
-template <typename T, typename... Args>
-typename MakeUniqueResult<T>::scalar MakeUnique(Args&&... args) {  // NOLINT
-  return std::unique_ptr<T>(
-      new T(std::forward<Args>(args)...));  // NOLINT(build/c++11)
-}
-
-// Overload for array of unknown bound.
-// The allocation of arrays needs to use the array form of new,
-// and cannot take element constructor arguments.
-template <typename T>
-typename MakeUniqueResult<T>::array MakeUnique(size_t n) {
-  return std::unique_ptr<T>(new typename std::remove_extent<T>::type[n]());
-}
-
-// Reject arrays of known bound.
-template <typename T, typename... Args>
-typename MakeUniqueResult<T>::invalid MakeUnique(Args&&... /* args */) =
-    delete;  // NOLINT
-
-}  // namespace port
+using tensorflow::MakeUnique;
+using tensorflow::WrapUnique;
 }  // namespace stream_executor

 namespace perftools {

From 56fd856425f1322d22796decb1f0580c8fab5d5a Mon Sep 17 00:00:00 2001
From: Justin Lebar
Date: Sun, 22 Apr 2018 14:48:05 -0700
Subject: [PATCH 0594/1734] [XLA] Make Executable return a ScopedShapedBuffer.

Previously, we returned a plain ShapedBuffer. But this doesn't capture our
semantics: It's up to the callee to free this ShapedBuffer.

PiperOrigin-RevId: 193854051
---
 .../compiler/xla/client/local_client.cc       | 12 ++---
 .../xla/service/allocation_tracker.cc         | 45 ++++++++++++-------
 .../compiler/xla/service/allocation_tracker.h | 32 ++++++++-----
 .../xla/service/cpu/cpu_executable.cc         | 14 +++---
 .../compiler/xla/service/cpu/cpu_executable.h |  8 ++--
 .../service/cpu/parallel_cpu_executable.cc    | 10 ++---
 .../xla/service/cpu/parallel_cpu_executable.h |  4 +-
 tensorflow/compiler/xla/service/executable.cc |  8 ++--
 tensorflow/compiler/xla/service/executable.h  |  8 ++--
 .../xla/service/gpu/gpu_executable.cc         |  8 ++--
 .../compiler/xla/service/gpu/gpu_executable.h |  4 +-
 tensorflow/compiler/xla/service/hlo_runner.cc | 14 ++----
 .../xla/service/interpreter/executable.cc     |  8 ++--
 .../xla/service/interpreter/executable.h      |  4 +-
 tensorflow/compiler/xla/service/service.cc    | 14 +++---
 .../compiler/xla/service/shaped_buffer.cc     |  4 +-
 .../compiler/xla/service/shaped_buffer.h      |  6 +++
 .../compiler/xla/service/transfer_manager.cc  | 15 ++-----
 .../compiler/xla/service/transfer_manager.h   |  5 +--
 tensorflow/compiler/xla/tests/fusion_test.cc  |  6 +--
 20 files changed, 119 insertions(+), 110 deletions(-)

diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index d0e945b70fd..1c127059037 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -166,12 +166,8 @@ StatusOr<ScopedShapedBuffer> LocalExecutable::Run(
   if (executable_->dumping()) {
     return ExecuteAndDump(&service_options, arguments);
   }
-  TF_ASSIGN_OR_RETURN(
-      ShapedBuffer result,
-      executable_->ExecuteOnStreamWrapper(
-          &service_options, run_options.execution_profile(), arguments));
-
-  return ScopedShapedBuffer(std::move(result), run_options.allocator());
+  return executable_->ExecuteOnStreamWrapper(
+      &service_options, run_options.execution_profile(), arguments);
 }

 StatusOr<ScopedShapedBuffer> LocalExecutable::ExecuteAndDump(
@@ -181,12 +177,12 @@ StatusOr<ScopedShapedBuffer> LocalExecutable::ExecuteAndDump(
       backend_->platform()->Name());
   TF_RETURN_IF_ERROR(RecordArguments(arguments, executable_->session_module()));
   TF_ASSIGN_OR_RETURN(
-      ShapedBuffer result,
+      ScopedShapedBuffer result,
       executable_->ExecuteOnStream(run_options, arguments,
                                    /*hlo_execution_profile=*/nullptr));
   TF_RETURN_IF_ERROR(RecordResult(&result, executable_->session_module()));
   TF_RETURN_IF_ERROR(executable_->DumpSessionModule());
-  return ScopedShapedBuffer(std::move(result), run_options->allocator());
+  return std::move(result);
 }

 tensorflow::Status LocalExecutable::RecordArguments(
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc
index 6bf65825cd0..cf1231bcce4 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.cc
+++ b/tensorflow/compiler/xla/service/allocation_tracker.cc
@@ -31,23 +31,35 @@ limitations under the License.

 namespace xla {

 StatusOr<GlobalDataHandle> AllocationTracker::Register(
-    ShapedBuffer shaped_buffer, const string& tag) {
+    ScopedShapedBuffer shaped_buffer, const string& tag) {
   tensorflow::mutex_lock lock(mutex_);
   VLOG(2) << "Register";
-  std::vector<ShapedBuffer> replicated_buffers;
+  std::vector<ScopedShapedBuffer> replicated_buffers;
   replicated_buffers.emplace_back(std::move(shaped_buffer));
   return RegisterInternal(std::move(replicated_buffers), tag);
 }

 StatusOr<GlobalDataHandle> AllocationTracker::RegisterReplicatedBuffers(
-    std::vector<ShapedBuffer> replicated_buffers, const string& tag) {
+    std::vector<ScopedShapedBuffer> replicated_buffers, const string& tag) {
   tensorflow::mutex_lock lock(mutex_);
   VLOG(2) << "RegisterReplicatedBuffers";
   return RegisterInternal(std::move(replicated_buffers), tag);
 }

+// ReleaseIfScopedShapedBuffer lets RegisterInternal<ShapedBufferTy>(b) call
+// b.release() if b is a ScopedShapedBuffer, or otherwise pass b through
+// unmodified.
+static ShapedBuffer ReleaseIfScopedShapedBuffer(ShapedBuffer b) { return b; }
+static ShapedBuffer ReleaseIfScopedShapedBuffer(ScopedShapedBuffer b) {
+  return b.release();
+}
+
+template <typename ShapedBufferTy>
 StatusOr<GlobalDataHandle> AllocationTracker::RegisterInternal(
-    std::vector<ShapedBuffer> replicated_buffers, const string& tag) {
+    std::vector<ShapedBufferTy> replicated_buffers, const string& tag) {
+  static_assert(std::is_same<ShapedBufferTy, ShapedBuffer>::value ||
+                    std::is_same<ShapedBufferTy, ScopedShapedBuffer>::value,
+                "ShapedBufferTy must be ShapedBuffer or ScopedShapedBuffer.");
   VLOG(2) << "RegisterInternal("
           << "tag: \"" << tag << "\" with " << replicated_buffers.size()
           << " shaped_buffers.";
@@ -65,17 +77,22 @@ StatusOr<GlobalDataHandle> AllocationTracker::RegisterInternal(
   int64 handle = next_handle_++;
   for (auto& shaped_buffer : replicated_buffers) {
     std::vector<ShapeIndex> shape_indices;
-    ShapeUtil::ForEachSubshape(shaped_buffer.on_device_shape(),
-                               [this, &shape_indices](const Shape& /*subshape*/,
-                                                      const ShapeIndex& index) {
-                                 shape_indices.push_back(index);
-                               });
+    ShapeUtil::ForEachSubshape(
+        shaped_buffer.on_device_shape(),
+        [&](const Shape& /*subshape*/, const ShapeIndex& index) {
+          shape_indices.push_back(index);
+        });
+    // Add shaped_buffer's buffers to opaque_to_allocation_map_, which owns
+    // them.
     for (const ShapeIndex& index : shape_indices) {
       AddAllocationOrIncrementRefCount(shaped_buffer.buffer(index),
                                        shaped_buffer.device_ordinal());
     }
+    // If ShapedBufferTy is ScopedShapedBuffer, release the ScopedShapedBuffer
+    // into a regular ShapedBuffer, which is stored in
+    // handle_to_shaped_buffers_.
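// Illustrative dispatch (per the overloads defined above): when
// ShapedBufferTy is ScopedShapedBuffer, ReleaseIfScopedShapedBuffer resolves
// to the overload calling b.release(), so ownership leaves the scoped
// wrapper and deallocation is governed solely by the refcounts in
// opaque_to_allocation_map_; for a plain ShapedBuffer it is the identity.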
+ handle_to_shaped_buffers_[handle].emplace_back(MakeUnique( + ReleaseIfScopedShapedBuffer(std::move(shaped_buffer)))); } GlobalDataHandle result; @@ -102,10 +119,6 @@ tensorflow::Status AllocationTracker::Unregister(const GlobalDataHandle& data) { shaped_buffer->device_ordinal())); } } - return Reset(data); -} - -Status AllocationTracker::Reset(const GlobalDataHandle& data) { // Keep a nullptr as a tombstone for unregistered handles. This enables // better error messages. That is, "handle has been deallocated" versus // "handle does not exist". @@ -152,7 +165,7 @@ StatusOr> AllocationTracker::DeconstructTuple( element_buffer.set_buffer(shaped_buffer->buffer(/*index=*/{i}), /*index=*/{}); std::vector replicated_buffers; - replicated_buffers.emplace_back(std::move(element_buffer)); + replicated_buffers.push_back(std::move(element_buffer)); TF_ASSIGN_OR_RETURN( GlobalDataHandle element_handle, RegisterInternal(std::move(replicated_buffers), "deconstructed tuple")); diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h index 2bfcd537129..1174fa641c0 100644 --- a/tensorflow/compiler/xla/service/allocation_tracker.h +++ b/tensorflow/compiler/xla/service/allocation_tracker.h @@ -45,13 +45,13 @@ class AllocationTracker { // Registers a shaped buffer of device memory, and returns a corresponding // handle that can be used for talking to XLA clients. The given shaped buffer // will be treated as the buffer corresponding to the only replica. - StatusOr Register(ShapedBuffer shaped_buffer, + StatusOr Register(ScopedShapedBuffer shaped_buffer, const string& tag); // Registers a vector of shaped buffers of device memory, one per replica, and // returns a corresponding handle that can be used for talking to XLA clients. StatusOr RegisterReplicatedBuffers( - std::vector replicated_buffers, const string& tag); + std::vector replicated_buffers, const string& tag); // Unregister the allocation for the given data handle. Status Unregister(const GlobalDataHandle& data); @@ -87,21 +87,21 @@ class AllocationTracker { }; // Internal helper which resolves the given GlobalDataHandle to a - // ShapedBuffer. + // list of ScopedShapedBuffers. StatusOr> ResolveInternal( const GlobalDataHandle& data) EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Internal helper which registers a vector of shaped buffers, one per - // replica. + // replica. ShapedBufferTy is either ScopedShapedBuffer or ShapedBuffer. If + // it's ShapedBuffer, all of the given buffers must already be tracked by this + // object -- presumably this is a call from DeconstructTuple. + template StatusOr RegisterInternal( - std::vector replicated_buffers, const string& tag) + std::vector replicated_buffers, const string& tag) EXCLUSIVE_LOCKS_REQUIRED(mutex_); - // Resets the shaped buffers corresponding to the given handle. - Status Reset(const GlobalDataHandle& data) EXCLUSIVE_LOCKS_REQUIRED(mutex_); - // Adds the given device address to the allocation tracker, or if it already - // exists, then increment it's reference count. + // exists, then increment its reference count. void AddAllocationOrIncrementRefCount(se::DeviceMemoryBase device_memory, int device_ordinal) EXCLUSIVE_LOCKS_REQUIRED(mutex_); @@ -133,7 +133,19 @@ class AllocationTracker { // buffers for different replicas. // // The ShapedBuffers in this map's vectors need to be unique_ptrs, because our - // public API returns pointers to them. + // public API returns pointers to them. 
We expect the concrete class to be + // ShapedBuffer and never ScopedShapedBuffer; deallocation of buffers is + // handled by opaque_to_allocation_map_. + // + // The elements of the vectors need to be unique_ptrs because we return + // pointers to them. (In theory we could use std::list or something instead, + // but we also want to be able to null out these elements.) + // + // The reason that the elements can't be unique_ptrs is + // the existence of DeconstructTuple(). This function allows us to create a + // non-owning "view" into a tuple's sub-buffers. The sub-buffers are then + // free'd when both the view *and* the original tuple are Unregistered. This + // refcounting is managed in opaque_to_allocation_map_. tensorflow::gtl::FlatMap>> handle_to_shaped_buffers_ GUARDED_BY(mutex_); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index 97e550abe44..aabf4d5161e 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -243,14 +243,14 @@ static Status DeallocateTempBuffers( return Status::OK(); } -StatusOr CpuExecutable::CreateResultShapedBuffer( +StatusOr CpuExecutable::CreateResultShapedBuffer( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice allocated_buffers, std::vector* buffers_in_result) { se::Stream* stream = run_options->stream(); - ShapedBuffer result_buffer( + ScopedShapedBuffer result_buffer( /*on_host_shape=*/result_shape(), /*on_device_shape=*/result_shape(), - stream->parent()->platform(), stream->parent()->device_ordinal()); + run_options->allocator(), stream->parent()->device_ordinal()); // Copy DeviceMemoryBase values which contain the array(s) of the result into // the respective location in ShapedBuffer which is returned to the caller. @@ -281,7 +281,7 @@ StatusOr CpuExecutable::CreateResultShapedBuffer( return std::move(result_buffer); } -StatusOr CpuExecutable::ExecuteOnStream( +StatusOr CpuExecutable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) { @@ -300,7 +300,7 @@ StatusOr CpuExecutable::ExecuteOnStream( std::vector buffers_in_result(assignment_->Allocations().size(), false); TF_ASSIGN_OR_RETURN( - ShapedBuffer result_buffer, + ScopedShapedBuffer result_buffer, CreateResultShapedBuffer(run_options, buffers, &buffers_in_result)); // Free all buffers not in the result. 
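Returning a ScopedShapedBuffer makes the ownership contract explicit: the device memory is freed by the destructor unless ownership is moved away first. A minimal sketch of that RAII shape, with illustrative types rather than the real DeviceMemoryAllocator API:

#include <cstdio>
#include <utility>
#include <vector>

struct Allocator {
  void Deallocate(int handle) { std::printf("freed buffer %d\n", handle); }
};

// Same data as a plain buffer, plus an allocator the destructor uses.
class ScopedBuffer {
 public:
  ScopedBuffer(std::vector<int> handles, Allocator* a)
      : handles_(std::move(handles)), allocator_(a) {}
  ScopedBuffer(ScopedBuffer&& other)
      : handles_(std::move(other.handles_)), allocator_(other.allocator_) {
    other.allocator_ = nullptr;  // a moved-from buffer must not free anything
  }
  ~ScopedBuffer() {
    if (allocator_ == nullptr) return;
    for (int h : handles_) allocator_->Deallocate(h);
  }

 private:
  std::vector<int> handles_;
  Allocator* allocator_;
};

int main() {
  Allocator a;
  { ScopedBuffer result({1, 2, 3}, &a); }  // all three freed at scope exit
}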
@@ -310,7 +310,7 @@ StatusOr CpuExecutable::ExecuteOnStream( return std::move(result_buffer); } -StatusOr CpuExecutable::ExecuteAsyncOnStream( +StatusOr CpuExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) { if (hlo_profiling_enabled()) { @@ -330,7 +330,7 @@ StatusOr CpuExecutable::ExecuteAsyncOnStream( std::vector buffers_in_result(assignment_->Allocations().size(), false); TF_ASSIGN_OR_RETURN( - ShapedBuffer result_buffer, + ScopedShapedBuffer result_buffer, CreateResultShapedBuffer(run_options, buffers, &buffers_in_result)); LogLiveAddresses(buffers, buffers_in_result); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h index 06b6943cb5a..68ad38cba88 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h @@ -55,12 +55,12 @@ class CpuExecutable : public Executable { std::unique_ptr hlo_profile_index_map); ~CpuExecutable() override {} - StatusOr ExecuteOnStream( + StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) override; - StatusOr ExecuteAsyncOnStream( + StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) override; @@ -102,13 +102,13 @@ class CpuExecutable : public Executable { tensorflow::gtl::ArraySlice buffers, HloExecutionProfile* hlo_execution_profile); - // Creates a ShapedBuffer for holding the result of the computation. The + // Creates a ScopedShapedBuffer for holding the result of the computation. The // addresses (DeviceMemoryBases) are set according to buffer assignment. // 'buffers_in_result' should point to a vector of the same size as // 'allocated_buffers'. An element in buffers_in_result is set to true if the // corresponding buffer is live out of the computation (and thus contained in // the returned ShapedBuffer). 
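The buffers_in_result bookkeeping described in the comment above is a mark-and-free pass: every allocation starts out presumed temporary, the walk over the result shape marks the live-out buffers (which the returned ScopedShapedBuffer now owns), and only the unmarked ones are deallocated. A compact sketch of that step, with ints standing in for device allocations:

#include <cstddef>
#include <vector>

void Free(int /*buffer*/) {}  // stand-in for the allocator call

void DeallocateTemps(const std::vector<int>& allocated,
                     const std::vector<bool>& in_result) {
  for (std::size_t i = 0; i < allocated.size(); ++i) {
    // Result buffers are owned by the returned ScopedShapedBuffer and must
    // be left alone; everything else was scratch space.
    if (!in_result[i]) Free(allocated[i]);
  }
}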
- StatusOr CreateResultShapedBuffer( + StatusOr CreateResultShapedBuffer( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice allocated_buffers, std::vector* buffers_in_result); diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc index a2bd4fa195b..035f9ddb2e2 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc @@ -447,7 +447,7 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions( return Status::OK(); } -StatusOr ParallelCpuExecutable::ExecuteOnStream( +StatusOr ParallelCpuExecutable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) { @@ -459,9 +459,9 @@ StatusOr ParallelCpuExecutable::ExecuteOnStream( DeviceMemoryAllocator* memory_allocator = run_options->allocator(); std::vector buffers(assignment_->Allocations().size()); - ShapedBuffer result_buffer( + ScopedShapedBuffer result_buffer( /*on_host_shape=*/result_shape(), /*on_device_shape=*/result_shape(), - stream->parent()->platform(), stream->parent()->device_ordinal()); + run_options->allocator(), stream->parent()->device_ordinal()); TF_RETURN_IF_ERROR(AllocateBuffers( memory_allocator, stream->parent()->device_ordinal(), &buffers)); @@ -470,7 +470,7 @@ StatusOr ParallelCpuExecutable::ExecuteOnStream( hlo_execution_profile)); // Copy DeviceMemoryBase values which into the respective location in - // ShapedBuffer which is returned to the caller. + // the ScopedShapedBuffer which is returned to the caller. std::vector buffers_in_result(assignment_->Allocations().size(), false); TF_RETURN_IF_ERROR(result_buffer.buffers().ForEachMutableElementWithStatus( [&](const ShapeIndex& index, se::DeviceMemoryBase* device_memory) { @@ -511,7 +511,7 @@ StatusOr ParallelCpuExecutable::ExecuteOnStream( return std::move(result_buffer); } -StatusOr ParallelCpuExecutable::ExecuteAsyncOnStream( +StatusOr ParallelCpuExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) { // TODO(b/30671675): Implement asynchronous execution mode. 
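The recurring `return std::move(result_buffer);` in these executables is load-bearing. Under the C++11/14 rules this code targets, the implicit move-on-return only applies when the returned local has the same type as the function's return type; here a ScopedShapedBuffer is being converted into a StatusOr, so without the explicit move the compiler would try the deleted copy constructor. A minimal sketch (the StatusOr stand-in is simplified; the pre-C++20 rule is the assumption being demonstrated):

#include <utility>

struct MoveOnly {
  MoveOnly() = default;
  MoveOnly(MoveOnly&&) = default;
  MoveOnly(const MoveOnly&) = delete;
};

template <typename T>
struct Result {  // greatly simplified stand-in for xla::StatusOr<T>
  Result(T v) : value(std::move(v)) {}
  T value;
};

Result<MoveOnly> Make() {
  MoveOnly m;
  // return m;          // pre-C++20: converting constructor sees an lvalue,
  //                    // i.e. the deleted copy constructor -- compile error
  return std::move(m);  // explicit move selects MoveOnly(MoveOnly&&)
}

int main() { Make(); }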
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h index 5ce84fa9964..55f8331b597 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h +++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h @@ -59,12 +59,12 @@ class ParallelCpuExecutable : public Executable { std::unique_ptr hlo_profile_index_map); ~ParallelCpuExecutable() override {} - StatusOr ExecuteOnStream( + StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) override; - StatusOr ExecuteAsyncOnStream( + StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) override; diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc index be19b3ff04c..021f09d310b 100644 --- a/tensorflow/compiler/xla/service/executable.cc +++ b/tensorflow/compiler/xla/service/executable.cc @@ -29,12 +29,12 @@ using tensorflow::gtl::ArraySlice; namespace xla { -StatusOr> Executable::ExecuteOnStreams( +StatusOr> Executable::ExecuteOnStreams( ArraySlice run_options, ArraySlice> arguments) { TF_RET_CHECK(run_options.size() == arguments.size()); - std::vector return_values; + std::vector return_values; return_values.reserve(run_options.size()); if (run_options.size() == 1) { @@ -60,7 +60,7 @@ StatusOr> Executable::ExecuteOnStreams( return std::move(return_values); } -StatusOr Executable::ExecuteOnStreamWrapper( +StatusOr Executable::ExecuteOnStreamWrapper( const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile, ArraySlice arguments) { se::Stream* stream = run_options->stream(); @@ -80,7 +80,7 @@ StatusOr Executable::ExecuteOnStreamWrapper( &hlo_profile_index_map()) : nullptr; - StatusOr return_value = + StatusOr return_value = ExecuteOnStream(run_options, arguments, profile_ptr.get()); TF_RETURN_IF_ERROR(return_value.status()); diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index 0c95f1a3611..f7af1ca5749 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -63,14 +63,14 @@ class Executable { // enabled. // // Returns a shaped buffer containing the result of the computation. - virtual StatusOr ExecuteOnStream( + virtual StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) = 0; // Same as ExecuteOnStream(), but this call is non-blocking and returns as // soon as all of the operations are enqueued for launch on the stream. - virtual StatusOr ExecuteAsyncOnStream( + virtual StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) = 0; @@ -78,7 +78,7 @@ class Executable { // streams. arguments[i] contains the arguments to the execution on // run_options[i]->stream() and the returned value is at index i of the // returned vector. - virtual StatusOr> ExecuteOnStreams( + virtual StatusOr> ExecuteOnStreams( tensorflow::gtl::ArraySlice run_options, tensorflow::gtl::ArraySlice< @@ -98,7 +98,7 @@ class Executable { // Convenience wrapper for calling Executable::ExecuteOnStream. Sets up a // timer for the execution, sets up HLO profiling if enabled, and fills in the // given ExecutionProfile if non-null. 
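One reason this patch touches every backend at once: C++ covariant return types only work for pointers and references to related classes, so a by-value StatusOr return type has to change in the base class and in every override in lockstep. A short sketch of the constraint, with illustrative types:

struct Buf {};
template <typename T>
struct Result { T v; };

struct Executable {
  virtual Result<Buf> ExecuteOnStream() = 0;
  virtual ~Executable() = default;
};

struct CpuExecutable : Executable {
  Result<Buf> ExecuteOnStream() override { return {Buf{}}; }
  // Declaring 'Result<OtherBuf> ExecuteOnStream() override' would not
  // compile: by-value return types of overrides must match exactly.
};

int main() { CpuExecutable e; e.ExecuteOnStream(); }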
- StatusOr ExecuteOnStreamWrapper( + StatusOr ExecuteOnStreamWrapper( const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile, tensorflow::gtl::ArraySlice arguments); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 62ce15bc59d..980cc89fa03 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -250,7 +250,7 @@ Status GpuExecutable::ExecuteThunks( return Status::OK(); } -StatusOr GpuExecutable::ExecuteOnStream( +StatusOr GpuExecutable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) { @@ -297,8 +297,8 @@ StatusOr GpuExecutable::ExecuteOnStream( HloInstruction* root = hlo_module_->entry_computation()->root_instruction(); auto device_ordinal = executor->device_ordinal(); - auto shaped_buffer = ShapedBuffer(root->shape(), root->shape(), - executor->platform(), device_ordinal); + ScopedShapedBuffer shaped_buffer(root->shape(), root->shape(), + memory_allocator, device_ordinal); // Copy DeviceMemoryBase values which contain the array(s) of the result into // the respective location in ShapedBuffer. @@ -335,7 +335,7 @@ StatusOr GpuExecutable::ExecuteOnStream( return std::move(shaped_buffer); } -StatusOr GpuExecutable::ExecuteAsyncOnStream( +StatusOr GpuExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) { // TODO(b/30671675): Implement asynchronous execution mode. diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h index 361bc30b2f3..80ec38c3ac1 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h @@ -74,12 +74,12 @@ class GpuExecutable : public Executable { // ExecuteOnStream will fail if the compute capability of the stream doesn't // match the compute capability passed to this object's constructor. - StatusOr ExecuteOnStream( + StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) override; - StatusOr ExecuteAsyncOnStream( + StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) override; diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc index df5ffd0b7d6..81c43db292a 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.cc +++ b/tensorflow/compiler/xla/service/hlo_runner.cc @@ -126,16 +126,12 @@ StatusOr> HloRunner::Execute( } TF_ASSIGN_OR_RETURN( - ShapedBuffer result, + ScopedShapedBuffer result, executable->ExecuteOnStreamWrapper( &service_run_options, /*profile=*/nullptr, argument_buffer_ptrs)); - // Create a ScopedShapedBuffer of the result to manage deallocation. This will - // deallocate all the device memory when it goes out of scope. 
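The hlo_runner hunks below show the caller-side payoff. A before/after sketch of what the removed lines were doing (signatures follow the surrounding diff):

// Before: the caller had to adopt the raw result explicitly.
//   ShapedBuffer result = <execute>;                    // unowned
//   ScopedShapedBuffer scoped(std::move(result), allocator);
// After: the executable hands back an owning buffer directly, so the
// adoption step, and its easy-to-forget allocator argument, disappears;
// the memory is freed when the result leaves scope.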
- ScopedShapedBuffer scoped_result(std::move(result), run_options.allocator()); - auto result_literal = backend().transfer_manager()->TransferLiteralFromDevice( - stream.parent(), scoped_result); + stream.parent(), result); if (result_literal.ok()) { VLOG(4) << "Executed binary and got result: " << result_literal.ValueOrDie()->ToString(); @@ -248,18 +244,16 @@ StatusOr>> HloRunner::ExecuteReplicated( } LOG(INFO) << "Replicated execution started"; - TF_ASSIGN_OR_RETURN(std::vector results, + TF_ASSIGN_OR_RETURN(std::vector results, executable->ExecuteOnStreams(service_run_options, argument_buffer_slices)); LOG(INFO) << "Replicated execution terminated"; std::vector> exec_results; for (int64 i = 0; i < options.num_replicas; ++i) { - ScopedShapedBuffer result(std::move(results[i]), - backend().memory_allocator()); TF_ASSIGN_OR_RETURN(std::unique_ptr literal, backend().transfer_manager()->TransferLiteralFromDevice( - streams[i]->parent(), result)); + streams[i]->parent(), results[i])); exec_results.push_back(std::move(literal)); } return std::move(exec_results); diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc index 6553000336b..61f199bc9e8 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.cc +++ b/tensorflow/compiler/xla/service/interpreter/executable.cc @@ -45,7 +45,7 @@ InterpreterExecutable::InterpreterExecutable( InterpreterExecutable::~InterpreterExecutable() {} -StatusOr InterpreterExecutable::ExecuteOnStream( +StatusOr InterpreterExecutable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) { @@ -88,8 +88,8 @@ StatusOr InterpreterExecutable::ExecuteOnStream( evaluator.Evaluate>(*computation, arg_literals)); // Transform the result literal back into a ShapedBuffer. 
- TF_ASSIGN_OR_RETURN(ShapedBuffer result, - transfer_manager->AllocateShapedBuffer( + TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result, + transfer_manager->AllocateScopedShapedBuffer( result_literal->shape(), run_options->allocator(), executor->device_ordinal())); TF_RETURN_IF_ERROR(transfer_manager->TransferLiteralToDevice( @@ -106,7 +106,7 @@ StatusOr InterpreterExecutable::ExecuteOnStream( return std::move(result); } -StatusOr InterpreterExecutable::ExecuteAsyncOnStream( +StatusOr InterpreterExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) { return tensorflow::errors::Unimplemented( diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h index c825a9a368d..b0b797ca7d6 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.h +++ b/tensorflow/compiler/xla/service/interpreter/executable.h @@ -43,12 +43,12 @@ class InterpreterExecutable : public Executable { InterpreterExecutable(std::unique_ptr hlo_module); ~InterpreterExecutable() override; - StatusOr ExecuteOnStream( + StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) override; - StatusOr ExecuteAsyncOnStream( + StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) override; diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index a73118c68a7..e8403c9e952 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -553,7 +553,7 @@ Service::ExecuteParallelAndRegisterResult( // Stream executors for the replicas of the current computation. TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*backend, device_handles[i])); CHECK_EQ(replicas.size(), arguments[i].size()); - std::vector result_buffers; + std::vector result_buffers; for (int64 replica = 0; replica < replicas.size(); ++replica) { TF_ASSIGN_OR_RETURN(Pool::SmartPtr stream, backend->BorrowStream(replicas[replica])); @@ -585,7 +585,7 @@ Service::ExecuteParallelAndRegisterResult( backend->StreamBorrower()); // Asynchronously launch the computation. - TF_ASSIGN_OR_RETURN(ShapedBuffer result, + TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result, executables[i]->ExecuteAsyncOnStream( &run_options, arguments[i][replica])); @@ -1237,7 +1237,7 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg, streams.push_back(std::move(stream)); } - std::vector result_buffers; + std::vector result_buffers; for (size_t i = 0; i < streams.size(); ++i) { const auto& stream = streams[i]; ExecutableRunOptions options; @@ -1250,7 +1250,7 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg, ServiceExecutableRunOptions service_options( options, execute_backend_->StreamBorrower()); - TF_ASSIGN_OR_RETURN(ShapedBuffer this_result_buffer, + TF_ASSIGN_OR_RETURN(ScopedShapedBuffer this_result_buffer, executable->ExecuteAsyncOnStream( &service_options, replicated_arguments[i])); @@ -1350,11 +1350,11 @@ tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg, } // Allocate memory in each replica and transfer the data to all replicas. 
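The TransferToServer hunk below follows a simple per-replica pattern: allocate an owning buffer on each replica's device, copy the literal in, and collect the owners so a single handle can refer to all replicas. A self-contained sketch of the loop's shape, with stand-in types (Device for a StreamExecutor, OwnedBuf for a ScopedShapedBuffer):

#include <utility>
#include <vector>

struct Device { int ordinal; };
struct OwnedBuf { int device_ordinal; };

OwnedBuf AllocateOn(const Device& d) { return OwnedBuf{d.ordinal}; }
void TransferTo(const Device&, OwnedBuf&) {}

std::vector<OwnedBuf> UploadToAllReplicas(const std::vector<Device>& replicas) {
  std::vector<OwnedBuf> replicated;
  for (const Device& d : replicas) {
    OwnedBuf buf = AllocateOn(d);   // one owning buffer per replica
    TransferTo(d, buf);             // copy the literal's bytes to the device
    replicated.push_back(std::move(buf));
  }
  return replicated;                // registered under one handle downstream
}

int main() { UploadToAllReplicas({{0}, {1}}); }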
- std::vector replicated_buffers; + std::vector replicated_buffers; for (se::StreamExecutor* executor : replicas) { TF_ASSIGN_OR_RETURN( - ShapedBuffer shaped_buffer, - execute_backend_->transfer_manager()->AllocateShapedBuffer( + ScopedShapedBuffer shaped_buffer, + execute_backend_->transfer_manager()->AllocateScopedShapedBuffer( shape, execute_backend_->memory_allocator(), executor->device_ordinal())); TF_RETURN_IF_ERROR( diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc index 0b5a383f6fe..fb3b5f06dad 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.cc +++ b/tensorflow/compiler/xla/service/shaped_buffer.cc @@ -117,7 +117,7 @@ ScopedShapedBuffer::ScopedShapedBuffer(ShapedBuffer shaped_buffer, : ShapedBuffer(std::move(shaped_buffer)), allocator_(allocator) {} ScopedShapedBuffer::ScopedShapedBuffer(ScopedShapedBuffer&& s) - : ShapedBuffer(std::move(s)), allocator_(s.allocator_) { + : ShapedBuffer(static_cast(s)), allocator_(s.allocator_) { // Null out s.allocator_ so it doesn't try to free anything in its destructor. s.allocator_ = nullptr; } @@ -151,7 +151,7 @@ ScopedShapedBuffer::~ScopedShapedBuffer() { } ShapedBuffer ScopedShapedBuffer::release() { - ShapedBuffer shaped_buffer(std::move(*this)); + ShapedBuffer shaped_buffer(static_cast(*this)); buffers_ = ShapeTree(); return shaped_buffer; } diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h index f1b0527474c..e10fca9e946 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.h +++ b/tensorflow/compiler/xla/service/shaped_buffer.h @@ -30,6 +30,8 @@ limitations under the License. namespace xla { +class ScopedShapedBuffer; + // Class which encapsulates a buffer or set of buffers containing data of a // particular XLA shape. class ShapedBuffer { @@ -49,6 +51,10 @@ class ShapedBuffer { ShapedBuffer(const ShapedBuffer&) = delete; ShapedBuffer& operator=(const ShapedBuffer&) = delete; + // Prevent (some forms of) accidental object slicing. + ShapedBuffer(const ScopedShapedBuffer&) = delete; + ShapedBuffer& operator=(const ScopedShapedBuffer&) = delete; + virtual ~ShapedBuffer(); // Returns the shape of the on-host representation of the data held by this diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc index 98d0111d04d..8b71a415091 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.cc +++ b/tensorflow/compiler/xla/service/transfer_manager.cc @@ -175,7 +175,7 @@ Status TransferManager::TransferBufferToDevice( return Status::OK(); } -StatusOr TransferManager::AllocateShapedBuffer( +StatusOr TransferManager::AllocateScopedShapedBuffer( const Shape& on_host_shape, DeviceMemoryAllocator* allocator, int device_ordinal) { if (!LayoutUtil::HasLayout(on_host_shape)) { @@ -187,8 +187,8 @@ StatusOr TransferManager::AllocateShapedBuffer( const Shape on_device_shape = HostShapeToDeviceShape(on_host_shape); TF_RET_CHECK(LayoutUtil::HasLayout(on_device_shape)); - ShapedBuffer shaped_buffer(on_host_shape, on_device_shape, - allocator->platform(), device_ordinal); + ScopedShapedBuffer shaped_buffer(on_host_shape, on_device_shape, allocator, + device_ordinal); // Allocate an appropriate sized buffer for each element in the shape // including the tuple pointer arrays. 
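The allocation loop referenced by the comment above makes one device allocation per subshape: the leaf arrays plus the pointer tables that represent tuple nodes, each recorded into the scoped buffer so its destructor can free them. A rough sketch of the bookkeeping only; the indices and sizes are simplified stand-ins, not the real ShapeIndex/ShapeUtil API:

#include <cstddef>
#include <map>
#include <vector>

std::map<int, int> AllocateAll(const std::vector<int>& subshape_bytes) {
  std::map<int, int> index_to_allocation;
  for (std::size_t i = 0; i < subshape_bytes.size(); ++i) {
    // In the real code: allocator->Allocate(device_ordinal, byte_size) for
    // the subshape at this index, with the result stored in the buffer tree.
    index_to_allocation[static_cast<int>(i)] = static_cast<int>(100 + i);
  }
  return index_to_allocation;
}

int main() { AllocateAll({1024, 64, 8}); }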
@@ -204,13 +204,4 @@ StatusOr TransferManager::AllocateShapedBuffer( return std::move(shaped_buffer); } -StatusOr TransferManager::AllocateScopedShapedBuffer( - const Shape& on_host_shape, DeviceMemoryAllocator* allocator, - int device_ordinal) { - TF_ASSIGN_OR_RETURN( - ShapedBuffer unscoped_buffer, - AllocateShapedBuffer(on_host_shape, allocator, device_ordinal)); - return ScopedShapedBuffer(std::move(unscoped_buffer), allocator); -} - } // namespace xla diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h index a6451c4bb11..d82b4f0f81b 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.h +++ b/tensorflow/compiler/xla/service/transfer_manager.h @@ -104,12 +104,9 @@ class TransferManager { // region for a host-to-device transfer. virtual int64 GetByteSizeRequirement(const Shape& shape) const = 0; - // Allocate a ShapedBuffer which can hold data with the given on-host + // Allocates a ScopedShapedBuffer which can hold data with the given on-host // shape. The on-device shape may be different as indicated by // HostShapeToDeviceShape. - StatusOr AllocateShapedBuffer(const Shape& on_host_shape, - DeviceMemoryAllocator* allocator, - int device_ordinal); StatusOr AllocateScopedShapedBuffer( const Shape& on_host_shape, DeviceMemoryAllocator* allocator, int device_ordinal); diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc index c7f64d85609..6f89e9164c8 100644 --- a/tensorflow/compiler/xla/tests/fusion_test.cc +++ b/tensorflow/compiler/xla/tests/fusion_test.cc @@ -794,19 +794,19 @@ void BM_ParallelFusion(int num_iters) { // Transfer literals to device. auto param0_literal = Literal::CreateR2F32Linspace(1.0, 2.0, param0_dim0, param0_dim1); - ShapedBuffer buffer0 = + ScopedShapedBuffer buffer0 = client->LiteralToShapedBuffer(*param0_literal, device_ordinal) .ConsumeValueOrDie(); auto param1_literal = Literal::CreateR2F32Linspace(1.0, 2.0, param1_dim0, param1_dim1); - ShapedBuffer buffer1 = + ScopedShapedBuffer buffer1 = client->LiteralToShapedBuffer(*param1_literal, device_ordinal) .ConsumeValueOrDie(); auto param2_literal = Literal::CreateR2F32Linspace(1.0, 2.0, param2_dim0, param2_dim1); - ShapedBuffer buffer2 = + ScopedShapedBuffer buffer2 = client->LiteralToShapedBuffer(*param2_literal, device_ordinal) .ConsumeValueOrDie(); From c1544d1c34dac9aa01ed2de84bc850f8d1bfe919 Mon Sep 17 00:00:00 2001 From: Yifei Feng Date: Sun, 22 Apr 2018 19:08:21 -0700 Subject: [PATCH 0595/1734] Update tuple for cuda version with auto as it was removed in #18434. --- tensorflow/core/kernels/conv_ops_gpu.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h index 7f9cfec981f..bbd5a53660f 100644 --- a/tensorflow/core/kernels/conv_ops_gpu.h +++ b/tensorflow/core/kernels/conv_ops_gpu.h @@ -143,8 +143,7 @@ class ConvParameters { bool ShouldIncludeWinogradNonfusedAlgo( perftools::gputools::StreamExecutor* stream_exec) const { // Skip this check for cuDNN 7 and newer. - perftools::gputools::port::StatusOr> version = - stream_exec->AsDnn()->GetVersion(); + auto version = stream_exec->AsDnn()->GetVersion(); if (version.ok() && std::get<0>(version.ValueOrDie()) >= 7) { return true; } From e5cfbd0eceb4dca98b388b13acff499a5420f863 Mon Sep 17 00:00:00 2001 From: Yifei Feng Date: Sun, 22 Apr 2018 20:00:54 -0700 Subject: [PATCH 0596/1734] Fix more for cuda version check. 
--- tensorflow/core/kernels/conv_ops_gpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h index bbd5a53660f..e8da5298e68 100644 --- a/tensorflow/core/kernels/conv_ops_gpu.h +++ b/tensorflow/core/kernels/conv_ops_gpu.h @@ -144,7 +144,7 @@ class ConvParameters { perftools::gputools::StreamExecutor* stream_exec) const { // Skip this check for cuDNN 7 and newer. auto version = stream_exec->AsDnn()->GetVersion(); - if (version.ok() && std::get<0>(version.ValueOrDie()) >= 7) { + if (version.ok() && version.ValueOrDie().major_version() >= 7) { return true; } return ShouldIncludeWinogradNonfusedAlgoPreCudnn7(); From 734636640534cd9478a7465c3975031a089629ea Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 22 Apr 2018 22:04:22 -0700 Subject: [PATCH 0597/1734] Rm references to SubmodelPort PiperOrigin-RevId: 193873101 --- tensorflow/contrib/optimizer_v2/optimizer_v2.py | 15 --------------- tensorflow/python/training/optimizer.py | 15 --------------- 2 files changed, 30 deletions(-) diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py index 25d19578ea8..dcb5bb6416a 100644 --- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py +++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py @@ -125,19 +125,6 @@ class _DenseResourceVariableProcessor(_OptimizableVariable): return update_op -class _StreamingModelPortProcessor(_OptimizableVariable): - """Processor for streaming ModelPorts.""" - - def __init__(self, v): - self._v = v - - def target(self): - return self._v - - def update_op(self, optimizer, g, *args): - return g - - class _TensorProcessor(_OptimizableVariable): """Processor for ordinary Tensors. @@ -167,8 +154,6 @@ def _get_processor(v): return _DenseResourceVariableProcessor(v) if isinstance(v, variables.Variable): return _RefVariableProcessor(v) - if v.op.type == "SubmodelPort": - return _StreamingModelPortProcessor(v) if isinstance(v, ops.Tensor): return _TensorProcessor(v) raise NotImplementedError("Trying to optimize unsupported type ", v) diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py index f126d3847b6..66914bacf35 100644 --- a/tensorflow/python/training/optimizer.py +++ b/tensorflow/python/training/optimizer.py @@ -170,19 +170,6 @@ class _DenseResourceVariableProcessor(_OptimizableVariable): return update_op -class _StreamingModelPortProcessor(_OptimizableVariable): - """Processor for streaming ModelPorts.""" - - def __init__(self, v): - self._v = v - - def target(self): - return self._v - - def update_op(self, optimizer, g): - return g - - class _TensorProcessor(_OptimizableVariable): """Processor for ordinary Tensors. @@ -216,8 +203,6 @@ def _get_processor(v): return _DenseResourceVariableProcessor(v) if isinstance(v, variables.Variable): return _RefVariableProcessor(v) - if v.op.type == "SubmodelPort": - return _StreamingModelPortProcessor(v) if isinstance(v, ops.Tensor): return _TensorProcessor(v) raise NotImplementedError("Trying to optimize unsupported type ", v) From 97bc1d90b385d06400376ceba8a924f4982c0434 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 22 Apr 2018 22:17:13 -0700 Subject: [PATCH 0598/1734] Init struct bools to false to prevent warnings by dynamic type checking programs when an uninitialized value is read by operator=. 
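The fix is an in-class default member initializer, which guarantees the bool is defined before any compiler-generated or hand-written operator= reads it, which is exactly what uninitialized-memory checkers flag. A minimal sketch:

struct WithDefault {
  bool is_source = false;  // value-initialized at construction
};
struct WithoutDefault {
  bool is_source;          // indeterminate until someone assigns it
};

int main() {
  WithDefault a, b;
  a = b;  // well-defined: both members were initialized
  WithoutDefault c, d;
  // c = d;  // would read d.is_source before any write: the exact
  //         // uninitialized-value report this patch silences
  (void)c; (void)d;
  return 0;
}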
PiperOrigin-RevId: 193873776
---
 tensorflow/core/framework/collective.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h
index 40d82ab0e97..0943b85fba9 100644
--- a/tensorflow/core/framework/collective.h
+++ b/tensorflow/core/framework/collective.h
@@ -80,7 +80,7 @@ struct CollInstanceParams {
   // Task name prefix of corresponding device name.
   std::vector<string> task_names;
   // True if every task has the same number of devices.
-  bool same_num_devices_per_task;
+  bool same_num_devices_per_task = false;
   CollImplDetails impl_details;
   string ToString() const;
   CollInstanceParams& operator=(const struct CollInstanceParams& other);
@@ -99,9 +99,9 @@ struct CollectiveParams {
   CollInstanceParams instance;
   CollTaskParams task;

-  string name;        // node name used only for log or error messages
-  int default_rank;   // index of this op within device_names
-  bool is_source;     // broadcast only
+  string name;                // node name used only for log or error messages
+  int default_rank;           // index of this op within device_names
+  bool is_source = false;     // broadcast only
   // Rank of this device in each subdivision permutation.
   std::vector<int> subdiv_rank;
   std::unique_ptr<OpKernel> merge_op;  // reduction only

From 6d57bca02b3278e812658fe5514a2bcb17670dbe Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Mon, 23 Apr 2018 02:53:01 -0700
Subject: [PATCH 0599/1734] Fix dilated bound calculation in window util for
 size 0

Previously the logic calculated incorrect bounds for the case where the base
bound is 0, causing issues with 0-sized base-dilated convolutions.

PiperOrigin-RevId: 193896380
---
 tensorflow/compiler/xla/window_util.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc
index 93284b80f9e..f11123ca248 100644
--- a/tensorflow/compiler/xla/window_util.cc
+++ b/tensorflow/compiler/xla/window_util.cc
@@ -199,6 +199,9 @@ bool IsInactiveWindowDimension(const Window& window, int64 logical_dim) {
 int64 DilatedBound(int64 bound, int64 dilation) {
   CHECK_GE(bound, 0);
   CHECK_GE(dilation, 1);
+  if (bound == 0) {
+    return 0;
+  }

   // Suppose the array has three entries 123 and the dilation factor is 4. Then
   // the dilated array has 9 entries 1xxx2xxx3. Here, each original entry except
@@ -212,7 +215,7 @@ int64 StridedBound(int64 bound, int64 window_size, int64 stride) {
   CHECK_GE(bound, 0);
   CHECK_GE(stride, 1);

-  if (window_size > bound) {
+  if (bound == 0 || window_size > bound) {
     return 0;
   }

From a821ea02afd05a96dd0e118e6ee745d472c61b3e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Mon, 23 Apr 2018 06:55:23 -0700
Subject: [PATCH 0600/1734] Support non-equal set sizes for FID computation.

PiperOrigin-RevId: 193917167
---
 .../gan/python/eval/python/classifier_metrics_impl.py | 30 ++++++++++---------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
index 47e51415fd9..d914f549457 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
@@ -488,25 +488,25 @@ def frechet_classifier_distance(real_images,
     The Frechet Inception distance. A floating-point scalar of the same type
     as the output of `classifier_fn`.
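For reference, the quantity both hunks of this patch manipulate is the Frechet distance between Gaussians fitted to the two activation sets. In standard notation (mine, not the patch's), with means $m, m_w$ and covariances $\Sigma, \Sigma_w$:

$$\mathrm{FID} = \lVert m - m_w \rVert_2^2 + \operatorname{Tr}\!\bigl(\Sigma + \Sigma_w - 2\,(\Sigma \Sigma_w)^{1/2}\bigr),$$

$$\Sigma = \frac{(X - m)^{\top}(X - m)}{n_r - 1}, \qquad \Sigma_w = \frac{(Y - m_w)^{\top}(Y - m_w)}{n_g - 1}.$$

The point of the change is that the real count $n_r$ and the generated count $n_g$ are now tracked separately, so the two activation sets no longer need to be the same size.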
""" - real_images_list = array_ops.split( real_images, num_or_size_splits=num_batches) generated_images_list = array_ops.split( generated_images, num_or_size_splits=num_batches) - imgs = array_ops.stack(real_images_list + generated_images_list) + real_imgs = array_ops.stack(real_images_list) + generated_imgs = array_ops.stack(generated_images_list) # Compute the activations using the memory-efficient `map_fn`. - activations = functional_ops.map_fn( - fn=classifier_fn, - elems=imgs, - parallel_iterations=1, - back_prop=False, - swap_memory=True, - name='RunClassifier') + def compute_activations(elems): + return functional_ops.map_fn(fn=classifier_fn, + elems=elems, + parallel_iterations=1, + back_prop=False, + swap_memory=True, + name='RunClassifier') - # Split the activations by the real and generated images. - real_a, gen_a = array_ops.split(activations, [num_batches, num_batches], 0) + real_a = compute_activations(real_imgs) + gen_a = compute_activations(generated_imgs) # Ensure the activations have the right shapes. real_a = array_ops.concat(array_ops.unstack(real_a), 0) @@ -697,18 +697,20 @@ def frechet_classifier_distance_from_activations(real_activations, # Compute mean and covariance matrices of activations. m = math_ops.reduce_mean(real_activations, 0) m_w = math_ops.reduce_mean(generated_activations, 0) - num_examples = math_ops.to_double(array_ops.shape(real_activations)[0]) + num_examples_real = math_ops.to_double(array_ops.shape(real_activations)[0]) + num_examples_generated = math_ops.to_double( + array_ops.shape(generated_activations)[0]) # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T real_centered = real_activations - m sigma = math_ops.matmul( real_centered, real_centered, transpose_a=True) / ( - num_examples - 1) + num_examples_real - 1) gen_centered = generated_activations - m_w sigma_w = math_ops.matmul( gen_centered, gen_centered, transpose_a=True) / ( - num_examples - 1) + num_examples_generated - 1) # Find the Tr(sqrt(sigma sigma_w)) component of FID sqrt_trace_component = trace_sqrt_product(sigma, sigma_w) From c45ffa87d3c7a74a32fcce5c9cebb2a30a2980ab Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Mon, 23 Apr 2018 07:36:37 -0700 Subject: [PATCH 0601/1734] Automated g4 rollback of changelist 193234819 PiperOrigin-RevId: 193921660 --- .../ci_build/windows/bazel/bazel_test_lib.sh | 7 +++++ .../windows/cpu/pip/build_tf_windows.sh | 26 +++++++++++++++---- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh index d654b433e7d..582188fc00b 100644 --- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh +++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh @@ -140,6 +140,13 @@ function run_configure_for_gpu_build { echo "" | ./configure } +function set_gcs_remote_cache_options { + echo "build --experimental_remote_spawn_cache" >> "${TMP_BAZELRC}" + echo "build --experimental_remote_platform_override='properties:{name:\"build\" value:\"windows-x64\"}'" >> "${TMP_BAZELRC}" + echo "build --remote_http_cache=https://storage.googleapis.com/$GCS_BUCKET_NAME" >> "${TMP_BAZELRC}" + echo "build --google_credentials=$GOOGLE_CLOUD_CREDENTIAL" >> "${TMP_BAZELRC}" +} + function create_python_test_dir() { rm -rf "$1" mkdir -p "$1" diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh index 5e9ae497e15..8b7495b3b8f 100644 --- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh +++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh @@ -42,20 +42,36 @@ source "tensorflow/tools/ci_build/windows/bazel/common_env.sh" \ source "tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh" \ || { echo "Failed to source bazel_test_lib.sh" >&2; exit 1; } +# Recreate an empty bazelrc file under source root +export TMP_BAZELRC=.tmp.bazelrc +rm -f "${TMP_BAZELRC}" +touch "${TMP_BAZELRC}" + +function cleanup { + # Remove all options in .tmp.bazelrc + echo "" > "${TMP_BAZELRC}" +} +trap cleanup EXIT + skip_test=0 for ARG in "$@"; do if [[ "$ARG" == --skip_test ]]; then skip_test=1 + elif [[ "$ARG" == --enable_gcs_remote_cache ]]; then + set_gcs_remote_cache_options fi done -run_configure_for_cpu_build - # --define=override_eigen_strong_inline=true speeds up the compiling of conv_grad_ops_3d.cc and conv_ops_3d.cc # by 20 minutes. See https://github.com/tensorflow/tensorflow/issues/10521 -BUILD_OPTS="--define=override_eigen_strong_inline=true" -bazel build -c opt $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $? +echo "build --define=override_eigen_strong_inline=true" >> "${TMP_BAZELRC}" + +echo "import %workspace%/${TMP_BAZELRC}" >> .bazelrc + +run_configure_for_cpu_build + +bazel build --announce_rc -c opt tensorflow/tools/pip_package:build_pip_package || exit $? if [[ "$skip_test" == 1 ]]; then exit 0 @@ -73,7 +89,7 @@ reinstall_tensorflow_pip ${PIP_NAME} # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore, # which will result testing system installed tensorflow -bazel test -c opt $BUILD_OPTS -k --test_output=errors \ +bazel test -c opt -k --test_output=errors \ --define=no_tensorflow_py_deps=true --test_lang_filters=py \ --test_tag_filters=-no_pip,-no_windows,-no_oss \ --build_tag_filters=-no_pip,-no_windows,-no_oss --build_tests_only \ From 9a39d4890da10545f326cf4180d758f2d7c2a3bb Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 23 Apr 2018 08:27:07 -0700 Subject: [PATCH 0602/1734] Adds functionality to subsample the inputs to extract image patches. 
Add functionality to subsample the extracted image patches based on the number of the outer products per entry of the covariance matrix. PiperOrigin-RevId: 193927804 --- .../kernel_tests/fisher_factors_test.py | 15 +++ tensorflow/contrib/kfac/python/ops/BUILD | 3 + .../contrib/kfac/python/ops/fisher_factors.py | 109 +++++++++++++++++- 3 files changed, 126 insertions(+), 1 deletion(-) diff --git a/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py b/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py index 2a3592c53fd..432b67e5690 100644 --- a/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py +++ b/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py @@ -814,6 +814,21 @@ class ConvInputKroneckerFactorTest(ConvFactorTestCase): new_cov = sess.run(factor.make_covariance_update_op(0.)) self.assertAllClose([[(1. + 4.) / 2.]], new_cov) + def testSubSample(self): + with tf_ops.Graph().as_default(): + patches_1 = array_ops.constant(1, shape=(10, 2)) + patches_2 = array_ops.constant(1, shape=(10, 8)) + patches_3 = array_ops.constant(1, shape=(3, 3)) + patches_1_sub = ff._subsample_for_cov_computation(patches_1) + patches_2_sub = ff._subsample_for_cov_computation(patches_2) + patches_3_sub = ff._subsample_for_cov_computation(patches_3) + patches_1_sub_batch_size = patches_1_sub.shape.as_list()[0] + patches_2_sub_batch_size = patches_2_sub.shape.as_list()[0] + patches_3_sub_batch_size = patches_3_sub.shape.as_list()[0] + self.assertEqual(2, patches_1_sub_batch_size) + self.assertEqual(8, patches_2_sub_batch_size) + self.assertEqual(3, patches_3_sub_batch_size) + class ConvOutputKroneckerFactorTest(ConvFactorTestCase): diff --git a/tensorflow/contrib/kfac/python/ops/BUILD b/tensorflow/contrib/kfac/python/ops/BUILD index b897fd68a08..cb0917bb851 100644 --- a/tensorflow/contrib/kfac/python/ops/BUILD +++ b/tensorflow/contrib/kfac/python/ops/BUILD @@ -37,10 +37,13 @@ py_library( deps = [ ":utils", "//tensorflow/python:array_ops", + "//tensorflow/python:control_flow_ops", + "//tensorflow/python:dtypes", "//tensorflow/python:framework_ops", "//tensorflow/python:init_ops", "//tensorflow/python:linalg_ops", "//tensorflow/python:math_ops", + "//tensorflow/python:random_ops", "//tensorflow/python:special_math_ops", "//tensorflow/python:training", "//tensorflow/python:variable_scope", diff --git a/tensorflow/contrib/kfac/python/ops/fisher_factors.py b/tensorflow/contrib/kfac/python/ops/fisher_factors.py index 0d40d265a17..b2da13db89f 100644 --- a/tensorflow/contrib/kfac/python/ops/fisher_factors.py +++ b/tensorflow/contrib/kfac/python/ops/fisher_factors.py @@ -32,6 +32,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import random_ops from tensorflow.python.ops import special_math_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables @@ -55,6 +56,22 @@ EIGENVALUE_DECOMPOSITION_THRESHOLD = 2 # matrix powers. Must be nonnegative. EIGENVALUE_CLIPPING_THRESHOLD = 0.0 +# Used to subsample the flattened extracted image patches. The number of +# outer products per row of the covariance matrix should not exceed this +# value. This parameter is used only if `_SUB_SAMPLE_OUTER_PRODUCTS` is True. +_MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW = 1 + +# Used to subsample the inputs passed to the extract image patches. 
The batch
+# size of the inputs to extract image patches is multiplied by this
+# factor. This parameter is used only if `_SUB_SAMPLE_INPUTS` is True.
+_INPUTS_TO_EXTRACT_PATCHES_FACTOR = 0.5
+
+# If True, then subsamples the tensor passed to compute the covariance matrix.
+_SUB_SAMPLE_OUTER_PRODUCTS = False
+
+# If True, then subsamples the tensor passed to compute the covariance matrix.
+_SUB_SAMPLE_INPUTS = False
+
 # TOWER_STRATEGY can be one of "concat" or "separate".  If "concat", the data
 # passed to the factors from the blocks will be concatenated across towers
 # (lazily via PartitionedTensor objects).  Otherwise a tuple of tensors over
@@ -67,12 +84,20 @@ def set_global_constants(init_covariances_at_zero=None,
                          zero_debias=None,
                          eigenvalue_decomposition_threshold=None,
                          eigenvalue_clipping_threshold=None,
+                         max_num_outer_products_per_cov_row=None,
+                         sub_sample_outer_products=None,
+                         inputs_to_extract_patches_factor=None,
+                         sub_sample_inputs=None,
                          tower_strategy=None):
   """Sets various global constants used by the classes in this module."""
   global INIT_COVARIANCES_AT_ZERO
   global ZERO_DEBIAS
   global EIGENVALUE_DECOMPOSITION_THRESHOLD
   global EIGENVALUE_CLIPPING_THRESHOLD
+  global _MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW
+  global _SUB_SAMPLE_OUTER_PRODUCTS
+  global _INPUTS_TO_EXTRACT_PATCHES_FACTOR
+  global _SUB_SAMPLE_INPUTS
   global TOWER_STRATEGY

   if init_covariances_at_zero is not None:
@@ -83,6 +108,14 @@ def set_global_constants(init_covariances_at_zero=None,
     EIGENVALUE_DECOMPOSITION_THRESHOLD = eigenvalue_decomposition_threshold
   if eigenvalue_clipping_threshold is not None:
     EIGENVALUE_CLIPPING_THRESHOLD = eigenvalue_clipping_threshold
+  if max_num_outer_products_per_cov_row is not None:
+    _MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW = max_num_outer_products_per_cov_row
+  if sub_sample_outer_products is not None:
+    _SUB_SAMPLE_OUTER_PRODUCTS = sub_sample_outer_products
+  if inputs_to_extract_patches_factor is not None:
+    _INPUTS_TO_EXTRACT_PATCHES_FACTOR = inputs_to_extract_patches_factor
+  if sub_sample_inputs is not None:
+    _SUB_SAMPLE_INPUTS = sub_sample_inputs
   if tower_strategy is not None:
     TOWER_STRATEGY = tower_strategy

@@ -227,6 +260,58 @@ def graph_func_to_string(func):
   return list_to_string(func.func_id)


+def _subsample_for_cov_computation(array, name=None):
+  """Subsamples the first dimension of the array.
+
+  `array` (A) is a tensor of shape `[batch_size, dim_2]`, so the covariance
+  matrix (A^T A) is of shape `[dim_2, dim_2]`. Subsample only if the number
+  of outer products per row of the covariance matrix is greater than
+  `_MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW`.
+
+  Args:
+    array: Tensor, of shape `[batch_size, dim_2]`.
+    name: `string`, Default(None)
+
+  Returns:
+    A tensor of shape `[max_samples, dim_2]`.
+
+  Raises:
+    ValueError: If array is not matrix-shaped.
+    ValueError: If array's batch_size cannot be inferred.
+
+  """
+  with tf_ops.name_scope(name, "subsample", [array]):
+    array = tf_ops.convert_to_tensor(array)
+    if len(array.shape) != 2:
+      raise ValueError("Input param array must be a matrix.")
+
+    batch_size = array.shape.as_list()[0]
+    if batch_size is None:
+      raise ValueError("Unable to get batch_size from input param array.")
+
+    num_cov_rows = array.shape.as_list()[-1]
+    max_batch_size = int(_MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW * num_cov_rows)
+    if batch_size <= max_batch_size:
+      return array
+
+    return _random_tensor_gather(array, max_batch_size)
+
+
+def _random_tensor_gather(array, max_size):
+  """Generates a random set of indices and gathers the values at the indices.
+ + Args: + array: Tensor, of shape `[batch_size, dim_2]`. + max_size: int, Number of indices to sample. + + Returns: + A tensor of shape `[max_size, ...]`. + """ + batch_size = array.shape.as_list()[0] + indices = random_ops.random_shuffle(math_ops.range(0, batch_size))[:max_size] + return array_ops.gather(array, indices) + + @six.add_metaclass(abc.ABCMeta) class FisherFactor(object): """Base class for objects modeling factors of approximate Fisher blocks. @@ -1153,7 +1238,9 @@ class ConvInputKroneckerFactor(InverseProvidingFactor): dilation_rate=None, data_format=None, extract_patches_fn=None, - has_bias=False): + has_bias=False, + sub_sample_inputs=None, + sub_sample_patches=None): """Initializes ConvInputKroneckerFactor. Args: @@ -1173,6 +1260,10 @@ class ConvInputKroneckerFactor(InverseProvidingFactor): patches. One of "extract_convolution_patches", "extract_image_patches", "extract_pointwise_conv2d_patches". has_bias: bool. If True, append 1 to in_channel. + sub_sample_inputs: `bool`. If True, then subsample the inputs from which + the image patches are extracted. (Default: None) + sub_sample_patches: `bool`, If `True` then subsample the extracted + patches.(Default: None) """ self._inputs = inputs self._filter_shape = filter_shape @@ -1182,7 +1273,15 @@ class ConvInputKroneckerFactor(InverseProvidingFactor): self._data_format = data_format self._extract_patches_fn = extract_patches_fn self._has_bias = has_bias + if sub_sample_inputs is None: + self._sub_sample_inputs = _SUB_SAMPLE_INPUTS + else: + self._sub_sample_inputs = sub_sample_inputs + if sub_sample_patches is None: + self._sub_sample_patches = _SUB_SAMPLE_OUTER_PRODUCTS + else: + self._sub_sample_patches = sub_sample_patches super(ConvInputKroneckerFactor, self).__init__() @property @@ -1215,6 +1314,10 @@ class ConvInputKroneckerFactor(InverseProvidingFactor): assert source == 0 inputs = self._inputs[tower] + if self._sub_sample_inputs: + batch_size = inputs.shape.as_list()[0] + max_size = int(batch_size * _INPUTS_TO_EXTRACT_PATCHES_FACTOR) + inputs = _random_tensor_gather(inputs, max_size) # TODO(b/64144716): there is potential here for a big savings in terms of # memory use. @@ -1260,8 +1363,12 @@ class ConvInputKroneckerFactor(InverseProvidingFactor): # |Delta| = number of spatial offsets, and J = number of input maps # for convolutional layer l. patches_flat = array_ops.reshape(patches, [-1, flatten_size]) + # We append a homogenous coordinate to patches_flat if the layer has # bias parameters. This gives us [[A_l]]_H from the paper. + if self._sub_sample_patches: + patches_flat = _subsample_for_cov_computation(patches_flat) + if self._has_bias: patches_flat = append_homog(patches_flat) # We call compute_cov without passing in a normalizer. compute_cov uses From fb7ce0375c325fc948b68126082b24bb0486c6a9 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Mon, 23 Apr 2018 08:43:18 -0700 Subject: [PATCH 0603/1734] Internal Change PiperOrigin-RevId: 193929733 --- tensorflow/compiler/aot/test.cc | 1 + tensorflow/compiler/xla/service/backend.cc | 1 + tensorflow/compiler/xla/shape_util.h | 1 + .../xla/tests/local_client_test_base.cc | 2 +- .../factorization/kernels/clustering_ops.cc | 1 + .../contrib/ffmpeg/default/ffmpeg_lib.cc | 2 +- tensorflow/core/BUILD | 6 ++- .../core/common_runtime/direct_session.cc | 2 +- .../kernel_benchmark_testlib.cc | 1 + .../core/common_runtime/local_device.cc | 1 + .../core/common_runtime/process_util.cc | 1 + tensorflow/core/framework/bfloat16.h | 1 + tensorflow/core/grappler/clusters/utils.cc | 1 + tensorflow/core/grappler/costs/utils.cc | 2 +- tensorflow/core/grappler/devices.cc | 1 + .../grappler/optimizers/constant_folding.cc | 1 + .../adaptive_shared_batch_scheduler.h | 1 + .../batching_util/shared_batch_scheduler.h | 1 + tensorflow/core/kernels/cast_op.h | 2 +- tensorflow/core/kernels/decode_raw_op.cc | 2 +- .../core/kernels/mkl_input_conversion_op.cc | 1 + tensorflow/core/kernels/mkl_tfconv_op.h | 1 + tensorflow/core/kernels/sparse_matmul_op.h | 1 + tensorflow/core/lib/bfloat16/bfloat16.h | 3 +- tensorflow/core/lib/core/coding.cc | 2 +- tensorflow/core/lib/core/raw_coding.h | 2 +- tensorflow/core/lib/gtl/inlined_vector.h | 2 +- tensorflow/core/lib/png/png_io.cc | 2 +- tensorflow/core/lib/wav/wav_io.cc | 2 +- tensorflow/core/platform/byte_order.h | 37 +++++++++++++++++++ tensorflow/core/platform/cpu_feature_guard.cc | 1 + tensorflow/core/platform/cpu_info.h | 7 ++-- tensorflow/core/platform/denormal.cc | 3 +- tensorflow/core/platform/windows/cpu_info.h | 9 ----- 34 files changed, 76 insertions(+), 28 deletions(-) create mode 100644 tensorflow/core/platform/byte_order.h diff --git a/tensorflow/compiler/aot/test.cc b/tensorflow/compiler/aot/test.cc index 47ef5f82cbc..6b098049cbd 100644 --- a/tensorflow/compiler/aot/test.cc +++ b/tensorflow/compiler/aot/test.cc @@ -35,6 +35,7 @@ limitations under the License. // clang-format on #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc index a582dbffd68..b1d616ec350 100644 --- a/tensorflow/compiler/xla/service/backend.cc +++ b/tensorflow/compiler/xla/service/backend.cc @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/eigen_thread_pool.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index 63da9154cfc..5fa728e7c2f 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -31,6 +31,7 @@ limitations under the License. 
#include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/gtl/optional.h" +#include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc index bb5aabb214d..b615f0feade 100644 --- a/tensorflow/compiler/xla/tests/local_client_test_base.cc +++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc @@ -27,7 +27,7 @@ limitations under the License. #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/core/common_runtime/eigen_thread_pool.h" #include "tensorflow/core/lib/core/threadpool.h" -#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" diff --git a/tensorflow/contrib/factorization/kernels/clustering_ops.cc b/tensorflow/contrib/factorization/kernels/clustering_ops.cc index 2a6c97e8b95..025534d540b 100644 --- a/tensorflow/contrib/factorization/kernels/clustering_ops.cc +++ b/tensorflow/contrib/factorization/kernels/clustering_ops.cc @@ -32,6 +32,7 @@ #include "tensorflow/core/lib/gtl/top_n.h" #include "tensorflow/core/lib/random/philox_random.h" #include "tensorflow/core/lib/random/simple_philox.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/logging.h" diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc index 35341406a08..cca1a054193 100644 --- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc +++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc @@ -28,7 +28,7 @@ #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/env.h" using tensorflow::strings::StrCat; diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 5b04574a4fa..a2ff29724bb 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -271,7 +271,7 @@ PLATFORM_BASE_HDRS = [ "platform/logging.h", "platform/macros.h", "platform/types.h", - "platform/cpu_info.h", + "platform/byte_order.h", ] PLATFORM_OTHER_HDRS = [ @@ -279,6 +279,7 @@ PLATFORM_OTHER_HDRS = [ "platform/stacktrace.h", "platform/stacktrace_handler.h", "platform/context.h", + "platform/cpu_info.h", "platform/cpu_feature_guard.h", "platform/dynamic_annotations.h", "platform/env.h", @@ -307,7 +308,6 @@ cc_library( srcs = glob([ "platform/*/integral_types.h", "platform/*/logging.h", - "platform/*/cpu_info.h", ]), hdrs = PLATFORM_BASE_HDRS, deps = [ @@ -658,6 +658,7 @@ cc_library( "framework/tensor_types.h", "framework/type_traits.h", "lib/bfloat16/bfloat16.h", + "platform/byte_order.h", "platform/default/dynamic_annotations.h", "platform/default/integral_types.h", "platform/default/logging.h", @@ -1903,6 +1904,7 @@ cc_library( "lib/core/casts.h", "lib/core/stringpiece.h", "lib/png/png_io.h", + "platform/byte_order.h", "platform/cpu_info.h", "platform/default/integral_types.h", "platform/default/logging.h", diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc index 
0479061daff..0afbd02e866 100644 --- a/tensorflow/core/common_runtime/direct_session.cc +++ b/tensorflow/core/common_runtime/direct_session.cc @@ -54,7 +54,7 @@ limitations under the License. #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/device_tracer.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/mutex.h" diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc index 64d88494756..7de1b80e2d6 100644 --- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc +++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/core/lib/core/notification.h" #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test_benchmark.h" diff --git a/tensorflow/core/common_runtime/local_device.cc b/tensorflow/core/common_runtime/local_device.cc index ca7f1614f1f..873182371e0 100644 --- a/tensorflow/core/common_runtime/local_device.cc +++ b/tensorflow/core/common_runtime/local_device.cc @@ -19,6 +19,7 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/common_runtime/eigen_thread_pool.h" #include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/cpu_feature_guard.h" #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/logging.h" diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc index 22fd940d82d..f8f3a1ecd73 100644 --- a/tensorflow/core/common_runtime/process_util.cc +++ b/tensorflow/core/common_runtime/process_util.cc @@ -21,6 +21,7 @@ limitations under the License. #include #include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/tracing.h" diff --git a/tensorflow/core/framework/bfloat16.h b/tensorflow/core/framework/bfloat16.h index 968c18bdd21..2f79d0fa708 100644 --- a/tensorflow/core/framework/bfloat16.h +++ b/tensorflow/core/framework/bfloat16.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_FRAMEWORK_BFLOAT16_H_ #include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/types.h" #if defined(PLATFORM_WINDOWS) diff --git a/tensorflow/core/grappler/clusters/utils.cc b/tensorflow/core/grappler/clusters/utils.cc index 50d6e6468fa..a7519725a54 100644 --- a/tensorflow/core/grappler/clusters/utils.cc +++ b/tensorflow/core/grappler/clusters/utils.cc @@ -32,6 +32,7 @@ limitations under the License. 
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/mem.h" diff --git a/tensorflow/core/grappler/costs/utils.cc b/tensorflow/core/grappler/costs/utils.cc index f318e3911c2..be54d98534e 100644 --- a/tensorflow/core/grappler/costs/utils.cc +++ b/tensorflow/core/grappler/costs/utils.cc @@ -44,7 +44,7 @@ limitations under the License. #include "tensorflow/core/lib/core/bits.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" diff --git a/tensorflow/core/grappler/devices.cc b/tensorflow/core/grappler/devices.cc index b318ac22d4b..2be894a08b2 100644 --- a/tensorflow/core/grappler/devices.cc +++ b/tensorflow/core/grappler/devices.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include "tensorflow/core/grappler/devices.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/cpu_info.h" #if GOOGLE_CUDA diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc index e29aaa25fe3..45bb188e8db 100644 --- a/tensorflow/core/grappler/optimizers/constant_folding.cc +++ b/tensorflow/core/grappler/optimizers/constant_folding.cc @@ -36,6 +36,7 @@ limitations under the License. #include "tensorflow/core/lib/gtl/inlined_vector.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/denormal.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/setround.h" diff --git a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h index 339d792302d..f5ced95febf 100644 --- a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h +++ b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/thread_annotations.h" diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h index b77289aded4..edc88a03847 100644 --- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h +++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h @@ -30,6 +30,7 @@ limitations under the License. 
#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/thread_annotations.h" diff --git a/tensorflow/core/kernels/cast_op.h b/tensorflow/core/kernels/cast_op.h index fd4e75d26f0..16d2e0e0a56 100644 --- a/tensorflow/core/kernels/cast_op.h +++ b/tensorflow/core/kernels/cast_op.h @@ -21,7 +21,7 @@ limitations under the License. #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/types.h" -#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/decode_raw_op.cc b/tensorflow/core/kernels/decode_raw_op.cc index bacacb94ae4..eaef5a6097f 100644 --- a/tensorflow/core/kernels/decode_raw_op.cc +++ b/tensorflow/core/kernels/decode_raw_op.cc @@ -21,7 +21,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/byte_order.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc index dcf6bb9f74e..ea763ce85ba 100644 --- a/tensorflow/core/kernels/mkl_input_conversion_op.cc +++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/util/tensor_format.h" diff --git a/tensorflow/core/kernels/mkl_tfconv_op.h b/tensorflow/core/kernels/mkl_tfconv_op.h index ddea9e281b2..4120f013acd 100644 --- a/tensorflow/core/kernels/mkl_tfconv_op.h +++ b/tensorflow/core/kernels/mkl_tfconv_op.h @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/util/tensor_format.h" diff --git a/tensorflow/core/kernels/sparse_matmul_op.h b/tensorflow/core/kernels/sparse_matmul_op.h index 14ef2ed7044..e89280724ee 100644 --- a/tensorflow/core/kernels/sparse_matmul_op.h +++ b/tensorflow/core/kernels/sparse_matmul_op.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_KERNELS_SPARSE_MATMUL_OP_H_ #include "third_party/eigen3/Eigen/Core" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/types.h" #if defined(PLATFORM_WINDOWS) diff --git a/tensorflow/core/lib/bfloat16/bfloat16.h b/tensorflow/core/lib/bfloat16/bfloat16.h index 126e5a17af4..e7c24387a48 100644 --- a/tensorflow/core/lib/bfloat16/bfloat16.h +++ b/tensorflow/core/lib/bfloat16/bfloat16.h @@ -19,8 +19,7 @@ limitations under the License. #include #include -// We need cpu_info.h here in order to pick up __BYTE_ORDER__. 
-#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/byte_order.h" #ifdef __CUDACC__ // All functions callable from CUDA code must be qualified with __device__ diff --git a/tensorflow/core/lib/core/coding.cc b/tensorflow/core/lib/core/coding.cc index bb95c274104..50872eef83a 100644 --- a/tensorflow/core/lib/core/coding.cc +++ b/tensorflow/core/lib/core/coding.cc @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/core/lib/core/coding.h" -#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/byte_order.h" namespace tensorflow { namespace core { diff --git a/tensorflow/core/lib/core/raw_coding.h b/tensorflow/core/lib/core/raw_coding.h index bbfd33d3037..37201b755d5 100644 --- a/tensorflow/core/lib/core/raw_coding.h +++ b/tensorflow/core/lib/core/raw_coding.h @@ -17,7 +17,7 @@ limitations under the License. #define TENSORFLOW_LIB_CORE_RAW_CODING_H_ #include -#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { diff --git a/tensorflow/core/lib/gtl/inlined_vector.h b/tensorflow/core/lib/gtl/inlined_vector.h index 6e3cb2206d9..2011f7d4a11 100644 --- a/tensorflow/core/lib/gtl/inlined_vector.h +++ b/tensorflow/core/lib/gtl/inlined_vector.h @@ -43,7 +43,7 @@ limitations under the License. #include #include "tensorflow/core/lib/gtl/manual_constructor.h" -#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/mem.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/core/lib/png/png_io.cc b/tensorflow/core/lib/png/png_io.cc index cba473927dd..62c803afb24 100644 --- a/tensorflow/core/lib/png/png_io.cc +++ b/tensorflow/core/lib/png/png_io.cc @@ -26,7 +26,7 @@ limitations under the License. #include "tensorflow/core/lib/core/casts.h" #include "tensorflow/core/lib/png/png_io.h" -#include "tensorflow/core/platform/cpu_info.h" // endian +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/png.h" diff --git a/tensorflow/core/lib/wav/wav_io.cc b/tensorflow/core/lib/wav/wav_io.cc index 51b9c6cd82c..3f7dbcee85c 100644 --- a/tensorflow/core/lib/wav/wav_io.cc +++ b/tensorflow/core/lib/wav/wav_io.cc @@ -23,7 +23,7 @@ limitations under the License. #include "tensorflow/core/lib/core/coding.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/wav/wav_io.h" -#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" diff --git a/tensorflow/core/platform/byte_order.h b/tensorflow/core/platform/byte_order.h new file mode 100644 index 00000000000..aab6535e4b0 --- /dev/null +++ b/tensorflow/core/platform/byte_order.h @@ -0,0 +1,37 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_BYTE_ORDER_H_ +#define TENSORFLOW_CORE_PLATFORM_BYTE_ORDER_H_ + +// Byte order defines provided by gcc. MSVC doesn't define those so +// we define them here. +// We assume that all windows platform out there are little endian. +#if defined(_MSC_VER) && !defined(__clang__) +#define __ORDER_LITTLE_ENDIAN__ 0x4d2 +#define __ORDER_BIG_ENDIAN__ 0x10e1 +#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ +#endif + +namespace tensorflow { +namespace port { + +// TODO(jeff,sanjay): Make portable +constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__; + +} // namespace port +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_BYTE_ORDER_H_ diff --git a/tensorflow/core/platform/cpu_feature_guard.cc b/tensorflow/core/platform/cpu_feature_guard.cc index b5706581580..9d00aa7b7fe 100644 --- a/tensorflow/core/platform/cpu_feature_guard.cc +++ b/tensorflow/core/platform/cpu_feature_guard.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/logging.h" diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h index bb77650e26e..b5be7e8b545 100644 --- a/tensorflow/core/platform/cpu_info.h +++ b/tensorflow/core/platform/cpu_info.h @@ -18,6 +18,10 @@ limitations under the License. #include +// TODO(ahentz): This is not strictly required here but, for historical +// reasons, many people depend on cpu_info.h in order to use kLittleEndian. +#include "tensorflow/core/platform/byte_order.h" + #if defined(_MSC_VER) #include "tensorflow/core/platform/windows/cpu_info.h" #endif @@ -25,9 +29,6 @@ limitations under the License. namespace tensorflow { namespace port { -// TODO(jeff,sanjay): Make portable -constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__; - // Returns an estimate of the number of schedulable CPUs for this // process. Usually, it's constant throughout the lifetime of a // process, but it might change if the underlying cluster management diff --git a/tensorflow/core/platform/denormal.cc b/tensorflow/core/platform/denormal.cc index 82cbc43b4f8..c510dc204f7 100644 --- a/tensorflow/core/platform/denormal.cc +++ b/tensorflow/core/platform/denormal.cc @@ -15,8 +15,9 @@ limitations under the License. #include -#include "tensorflow/core/platform/denormal.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/denormal.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/platform.h" // If we're on gcc 4.8 or older, there's a known bug that prevents the use of diff --git a/tensorflow/core/platform/windows/cpu_info.h b/tensorflow/core/platform/windows/cpu_info.h index f20939d3c0f..ba2126abcfc 100644 --- a/tensorflow/core/platform/windows/cpu_info.h +++ b/tensorflow/core/platform/windows/cpu_info.h @@ -19,13 +19,4 @@ limitations under the License. // included so __cpuidex function is available for GETCPUID on Windows #include -// Byte order defines provided by gcc. MSVC doesn't define those so -// we define them here. -// We assume that all windows platform out there are little endian. 
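Everything the new header provides reduces to the single constant tensorflow::port::kLittleEndian, computed from the gcc byte-order defines (or their MSVC stand-ins defined just above). A minimal sketch of how such a constant is typically consumed, e.g. when normalizing data to an on-disk little-endian format (ToLittleEndian32 is an illustrative helper, not part of the patch):

#include <cstdint>

#include "tensorflow/core/platform/byte_order.h"

// Illustrative only: return the value whose bytes are in little-endian
// order, swapping only when the host is big-endian.
inline uint32_t ToLittleEndian32(uint32_t v) {
  if (tensorflow::port::kLittleEndian) return v;
  return ((v & 0xFF000000u) >> 24) | ((v & 0x00FF0000u) >> 8) |
         ((v & 0x0000FF00u) << 8) | ((v & 0x000000FFu) << 24);
}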
-#if defined(_MSC_VER) && !defined(__clang__) -#define __ORDER_LITTLE_ENDIAN__ 0x4d2 -#define __ORDER_BIG_ENDIAN__ 0x10e1 -#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ -#endif - #endif // TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_ From 26ff316f49e613a7f9cba02dd5e7d6cd5aa78623 Mon Sep 17 00:00:00 2001 From: Pavithra Vijay Date: Mon, 23 Apr 2018 11:03:13 -0700 Subject: [PATCH 0604/1734] Fix flaky stateful metrics test PiperOrigin-RevId: 193951580 --- .../keras/_impl/keras/engine/network.py | 2 +- .../python/keras/_impl/keras/metrics_test.py | 119 +++++++++--------- 2 files changed, 61 insertions(+), 60 deletions(-) diff --git a/tensorflow/python/keras/_impl/keras/engine/network.py b/tensorflow/python/keras/_impl/keras/engine/network.py index cc177c14a89..3b419dff3a1 100644 --- a/tensorflow/python/keras/_impl/keras/engine/network.py +++ b/tensorflow/python/keras/_impl/keras/engine/network.py @@ -126,7 +126,7 @@ class Network(base_layer.Layer): else: self.outputs = [outputs] - # User-prodived argument validation. + # User-provided argument validation. if context.executing_eagerly(): # Check that all inputs/outputs are DeferredTensors. for tensor in self.inputs: diff --git a/tensorflow/python/keras/_impl/keras/metrics_test.py b/tensorflow/python/keras/_impl/keras/metrics_test.py index 9deaab0c056..13cef978127 100644 --- a/tensorflow/python/keras/_impl/keras/metrics_test.py +++ b/tensorflow/python/keras/_impl/keras/metrics_test.py @@ -75,74 +75,75 @@ class KerasMetricsTest(test.TestCase): self.assertEqual(result, 0.) def test_stateful_metrics(self): - np.random.seed(1334) + with self.test_session(): + np.random.seed(1334) - class BinaryTruePositives(keras.layers.Layer): - """Stateful Metric to count the total true positives over all batches. + class BinaryTruePositives(keras.layers.Layer): + """Stateful Metric to count the total true positives over all batches. - Assumes predictions and targets of shape `(samples, 1)`. + Assumes predictions and targets of shape `(samples, 1)`. - Arguments: - threshold: Float, lower limit on prediction value that counts as a - positive class prediction. - name: String, name for the metric. - """ - - def __init__(self, name='true_positives', **kwargs): - super(BinaryTruePositives, self).__init__(name=name, **kwargs) - self.true_positives = keras.backend.variable(value=0, dtype='int32') - - def reset_states(self): - keras.backend.set_value(self.true_positives, 0) - - def __call__(self, y_true, y_pred): - """Computes the number of true positives in a batch. - - Args: - y_true: Tensor, batch_wise labels - y_pred: Tensor, batch_wise predictions - - Returns: - The total number of true positives seen this epoch at the - completion of the batch. + Arguments: + threshold: Float, lower limit on prediction value that counts as a + positive class prediction. + name: String, name for the metric. 
""" - y_true = math_ops.cast(y_true, 'int32') - y_pred = math_ops.cast(math_ops.round(y_pred), 'int32') - correct_preds = math_ops.cast(math_ops.equal(y_pred, y_true), 'int32') - true_pos = math_ops.cast( - math_ops.reduce_sum(correct_preds * y_true), 'int32') - current_true_pos = self.true_positives * 1 - self.add_update( - state_ops.assign_add(self.true_positives, true_pos), - inputs=[y_true, y_pred]) - return current_true_pos + true_pos - metric_fn = BinaryTruePositives() - config = keras.metrics.serialize(metric_fn) - metric_fn = keras.metrics.deserialize( - config, custom_objects={'BinaryTruePositives': BinaryTruePositives}) + def __init__(self, name='true_positives', **kwargs): + super(BinaryTruePositives, self).__init__(name=name, **kwargs) + self.true_positives = keras.backend.variable(value=0, dtype='int32') - # Test on simple model - inputs = keras.Input(shape=(2,)) - outputs = keras.layers.Dense(1, activation='sigmoid')(inputs) - model = keras.Model(inputs, outputs) - model.compile(optimizer='sgd', - loss='binary_crossentropy', - metrics=['acc', metric_fn]) + def reset_states(self): + keras.backend.set_value(self.true_positives, 0) - # Test fit, evaluate - samples = 1000 - x = np.random.random((samples, 2)) - y = np.random.randint(2, size=(samples, 1)) - model.fit(x, y, epochs=1, batch_size=10) - outs = model.evaluate(x, y, batch_size=10) - preds = model.predict(x) + def __call__(self, y_true, y_pred): + """Computes the number of true positives in a batch. - def ref_true_pos(y_true, y_pred): - return np.sum(np.logical_and(y_pred > 0.5, y_true == 1)) + Args: + y_true: Tensor, batch_wise labels + y_pred: Tensor, batch_wise predictions - # Test correctness (e.g. updates should have been run) - self.assertAllClose(outs[2], ref_true_pos(y, preds), atol=1e-5) + Returns: + The total number of true positives seen this epoch at the + completion of the batch. + """ + y_true = math_ops.cast(y_true, 'int32') + y_pred = math_ops.cast(math_ops.round(y_pred), 'int32') + correct_preds = math_ops.cast(math_ops.equal(y_pred, y_true), 'int32') + true_pos = math_ops.cast( + math_ops.reduce_sum(correct_preds * y_true), 'int32') + current_true_pos = self.true_positives * 1 + self.add_update( + state_ops.assign_add(self.true_positives, true_pos), + inputs=[y_true, y_pred]) + return current_true_pos + true_pos + + metric_fn = BinaryTruePositives() + config = keras.metrics.serialize(metric_fn) + metric_fn = keras.metrics.deserialize( + config, custom_objects={'BinaryTruePositives': BinaryTruePositives}) + + # Test on simple model + inputs = keras.Input(shape=(2,)) + outputs = keras.layers.Dense(1, activation='sigmoid')(inputs) + model = keras.Model(inputs, outputs) + model.compile(optimizer='sgd', + loss='binary_crossentropy', + metrics=['acc', metric_fn]) + + # Test fit, evaluate + samples = 1000 + x = np.random.random((samples, 2)) + y = np.random.randint(2, size=(samples, 1)) + model.fit(x, y, epochs=1, batch_size=10) + outs = model.evaluate(x, y, batch_size=10) + preds = model.predict(x) + + def ref_true_pos(y_true, y_pred): + return np.sum(np.logical_and(y_pred > 0.5, y_true == 1)) + + # Test correctness (e.g. updates should have been run) + self.assertAllClose(outs[2], ref_true_pos(y, preds), atol=1e-5) if __name__ == '__main__': From f0d5d2047833c7221ce3be1690689ca1c6658add Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 23 Apr 2018 11:23:01 -0700 Subject: [PATCH 0605/1734] Convert int -> size_t so that implicit conversion doesn't lose integer precision. 
PiperOrigin-RevId: 193955175 --- tensorflow/contrib/lite/context.h | 6 +++--- tensorflow/contrib/lite/interpreter.cc | 13 +++++++++---- tensorflow/contrib/lite/interpreter.h | 12 ++++++------ tensorflow/contrib/lite/interpreter_test.cc | 8 ++++---- tensorflow/contrib/lite/optional_debug_tools.cc | 2 +- 5 files changed, 23 insertions(+), 18 deletions(-) diff --git a/tensorflow/contrib/lite/context.h b/tensorflow/contrib/lite/context.h index 0b38f43cd32..12841d233cc 100644 --- a/tensorflow/contrib/lite/context.h +++ b/tensorflow/contrib/lite/context.h @@ -275,7 +275,7 @@ typedef struct { typedef struct TfLiteContext { // Number of tensors in the context. - int tensors_size; + size_t tensors_size; // The execution plan contains a list of the node indices in execution // order. execution_plan->size is the current number of nodes. And, @@ -397,13 +397,13 @@ typedef struct _TfLiteDelegate { // This can be null if the delegate doesn't use its own buffer. TfLiteStatus (*CopyFromBufferHandle)(TfLiteDelegate* delegate, TfLiteBufferHandle buffer_handle, - void* data, int size); + void* data, size_t size); // Copy the data from raw memory to delegate buffer handle. // This can be null if the delegate doesn't use its own buffer. TfLiteStatus (*CopyToBufferHandle)(TfLiteDelegate* delegate, TfLiteBufferHandle buffer_handle, - void* data, int size); + void* data, size_t size); // Free the Delegate Buffer Handle. Note: This only frees the handle, but // this doesn't release the underlying resource (e.g. textures). The diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc index 91b6c414bf0..9d8ea55fd1e 100644 --- a/tensorflow/contrib/lite/interpreter.cc +++ b/tensorflow/contrib/lite/interpreter.cc @@ -308,7 +308,12 @@ TfLiteStatus Interpreter::CheckTensorIndices(const char* label, for (int i = 0; i < length; i++) { int index = indices[i]; - if (index < kOptionalTensor || index >= context_.tensors_size) { + // Continue if index == kOptionalTensor before additional comparisons below, + // size_t(-1) is always >= context_tensors_size. + if (index == kOptionalTensor) { + continue; + } + if (index < 0 || static_cast<size_t>(index) >= context_.tensors_size) { ReportError(&context_, "Invalid tensor index %d in %s\n", index, label); consistent_ = false; return kTfLiteError; @@ -318,7 +323,7 @@ TfLiteStatus Interpreter::CheckTensorIndices(const char* label, } TfLiteStatus Interpreter::BytesRequired(TfLiteType type, const int* dims, - int dims_size, size_t* bytes) { + size_t dims_size, size_t* bytes) { // TODO(aselle): Check for overflow here using overflow.h in TensorFlow // MultiplyWithoutOverflow. TF_LITE_ENSURE(&context_, bytes != nullptr); @@ -645,7 +650,7 @@ TfLiteStatus Interpreter::GetNodeAndRegistration( } TfLiteStatus Interpreter::SetTensorParametersReadOnly( - int tensor_index, TfLiteType type, const char* name, const int rank, + int tensor_index, TfLiteType type, const char* name, const size_t rank, const int* dims, TfLiteQuantizationParams quantization, const char* buffer, size_t bytes, const Allocation* allocation) { if (state_ == kStateInvokableAndImmutable) { @@ -691,7 +696,7 @@ TfLiteStatus Interpreter::SetTensorParametersReadOnly( // bytes. The lifetime of buffer must be ensured to be greater or equal // to Interpreter.
TfLiteStatus Interpreter::SetTensorParametersReadWrite( - int tensor_index, TfLiteType type, const char* name, const int rank, + int tensor_index, TfLiteType type, const char* name, const size_t rank, const int* dims, TfLiteQuantizationParams quantization) { if (state_ == kStateInvokableAndImmutable) { ReportError( diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h index a49134b95ee..6f3433abcf7 100644 --- a/tensorflow/contrib/lite/interpreter.h +++ b/tensorflow/contrib/lite/interpreter.h @@ -150,7 +150,7 @@ class Interpreter { }; TfLiteStatus SetTensorParametersReadOnly( - int tensor_index, TfLiteType type, const char* name, const int rank, + int tensor_index, TfLiteType type, const char* name, const size_t rank, const int* dims, TfLiteQuantizationParams quantization, const char* buffer, size_t bytes, const Allocation* allocation = nullptr); @@ -165,7 +165,7 @@ class Interpreter { dims.data(), quantization); } TfLiteStatus SetTensorParametersReadWrite( - int tensor_index, TfLiteType type, const char* name, const int rank, + int tensor_index, TfLiteType type, const char* name, const size_t rank, const int* dims, TfLiteQuantizationParams quantization); // Functions to access tensor data @@ -189,10 +189,10 @@ class Interpreter { } // Return the number of tensors in the model. - int tensors_size() const { return context_.tensors_size; } + size_t tensors_size() const { return context_.tensors_size; } // Return the number of ops in the model. - int nodes_size() const { return nodes_and_registration_.size(); } + size_t nodes_size() const { return nodes_and_registration_.size(); } // WARNING: Experimental interface, subject to change const std::vector<int>& execution_plan() const { return execution_plan_; } @@ -406,7 +406,7 @@ class Interpreter { // Compute the number of bytes required to represent a tensor with dimensions // specified by the array dims (of length dims_size). Returns the status code // and bytes. - TfLiteStatus BytesRequired(TfLiteType type, const int* dims, int dims_size, + TfLiteStatus BytesRequired(TfLiteType type, const int* dims, size_t dims_size, size_t* bytes); // Request an tensor be resized implementation. If the given tensor is of @@ -467,7 +467,7 @@ class Interpreter { // tensors. After calling this function, adding `kTensorsCapacityHeadroom` // more tensors won't invalidate the pointer to existing tensors. void EnsureTensorsVectorCapacity() { - const int required_capacity = tensors_size() + kTensorsCapacityHeadroom; + const size_t required_capacity = tensors_size() + kTensorsCapacityHeadroom; if (required_capacity > tensors_.capacity()) { tensors_.reserve(required_capacity); context_.tensors = tensors_.data(); diff --git a/tensorflow/contrib/lite/interpreter_test.cc b/tensorflow/contrib/lite/interpreter_test.cc index 131e0880798..453c1ada1cf 100644 --- a/tensorflow/contrib/lite/interpreter_test.cc +++ b/tensorflow/contrib/lite/interpreter_test.cc @@ -887,15 +887,15 @@ class TestDelegate : public ::testing::Test { TfLiteIntArrayFree(nodes_to_separate); return kTfLiteOk; }; - delegate_.CopyToBufferHandle = [](TfLiteDelegate* delegate, - TfLiteBufferHandle buffer_handle, - void* data, int size) -> TfLiteStatus { + delegate_.CopyToBufferHandle = + [](TfLiteDelegate* delegate, TfLiteBufferHandle buffer_handle, + void* data, size_t size) -> TfLiteStatus { // TODO(ycling): Implement tests to test buffer copying logic.
return kTfLiteOk; }; delegate_.CopyFromBufferHandle = [](TfLiteDelegate* delegate, TfLiteBufferHandle buffer_handle, - void* data, int size) -> TfLiteStatus { + void* data, size_t size) -> TfLiteStatus { // TODO(ycling): Implement tests to test buffer copying logic. return kTfLiteOk; }; diff --git a/tensorflow/contrib/lite/optional_debug_tools.cc b/tensorflow/contrib/lite/optional_debug_tools.cc index e1366639c78..e0a09101171 100644 --- a/tensorflow/contrib/lite/optional_debug_tools.cc +++ b/tensorflow/contrib/lite/optional_debug_tools.cc @@ -72,7 +72,7 @@ const char* AllocTypeName(TfLiteAllocationType type) { // Prints a dump of what tensors and what nodes are in the interpreter. void PrintInterpreterState(Interpreter* interpreter) { - printf("Interpreter has %d tensors and %d nodes\n", + printf("Interpreter has %lu tensors and %lu nodes\n", interpreter->tensors_size(), interpreter->nodes_size()); printf("Inputs:"); PrintIntVector(interpreter->inputs()); From 829ec055afdfca3424030794c469d19290df13fe Mon Sep 17 00:00:00 2001 From: Yifei Feng <1192265+yifeif@users.noreply.github.com> Date: Mon, 23 Apr 2018 11:44:22 -0700 Subject: [PATCH 0606/1734] Update resources.h --- .../core/kernels/boosted_trees/resources.h | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/tensorflow/core/kernels/boosted_trees/resources.h b/tensorflow/core/kernels/boosted_trees/resources.h index ef426048972..df78d3f275b 100644 --- a/tensorflow/core/kernels/boosted_trees/resources.h +++ b/tensorflow/core/kernels/boosted_trees/resources.h @@ -82,26 +82,6 @@ class BoostedTreesEnsembleResource : public StampedResource { int64 GetNumNodes(const int32 tree_id); - void UpdateLastLayerNodesRange(const int32 node_range_start, - int32 node_range_end) const { - tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_start( - node_range_start); - tree_ensemble_->mutable_growing_metadata()->set_last_layer_node_end( - node_range_end); - } - - void GetLastLayerNodesRange(int32* node_range_start, - int32* node_range_end) const { - *node_range_start = - tree_ensemble_->growing_metadata().last_layer_node_start(); - *node_range_end = tree_ensemble_->growing_metadata().last_layer_node_end(); - } - - int64 GetNumNodes(const int32 tree_id) { - DCHECK_LT(tree_id, tree_ensemble_->trees_size()); - return tree_ensemble_->trees(tree_id).nodes_size(); - } - void UpdateGrowingMetadata() const; int32 GetNumLayersAttempted(); From d93e09fbd3408f6ee1647addfdca1eef00139223 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 23 Apr 2018 11:42:02 -0700 Subject: [PATCH 0607/1734] Add fast shuffled fully-connected path also for the case where the RHS has 4 columns (so far it existed only for the GEMV case, where the RHS has 1 column). Also pre-shuffle / pre-xor the input activations, not just the weights. We need a buffer for that, so the shuffled FullyConnected operator gets a second output acting as its workspace, similar to what we have been doing for Conv operators, which needed an im2col workspace buffer.
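The pre-xor of weights and activations mentioned above relies on an identity that the kernel comments in the diff below also spell out: for quantized uint8 data with zero point 128, flipping the sign bit and reinterpreting the result as int8 yields value - 128 for free. A standalone sanity check of the identity (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (int v = 0; v < 256; ++v) {
    const uint8_t u = static_cast<uint8_t>(v);
    // Flip the sign bit, then reinterpret the bits as signed.
    const int8_t flipped = static_cast<int8_t>(u ^ 0x80);
    assert(flipped == v - 128);  // same as subtracting the zero point
  }
  return 0;
}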
PiperOrigin-RevId: 193958461 --- .../internal/optimized/optimized_ops.h | 448 +++++++++++++----- .../internal/reference/reference_ops.h | 155 ++++-- .../experimental_shuffle_fc_weights.cc | 27 +- tensorflow/contrib/lite/toco/tooling_util.cc | 15 +- 4 files changed, 483 insertions(+), 162 deletions(-) diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h index 2e2721e0930..49ce1133d34 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h @@ -1209,109 +1209,275 @@ void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, // as the 'task' for worker threads to run (multi-threaded case, see // ExperimentalShuffledFullyConnectedWorkerTask below). inline void ExperimentalShuffledFullyConnectedWorkerImpl( - const uint8* input_data, const int8* shuffled_weights_data, - int output_depth, int accum_depth, const int32* bias_data, + const uint8* shuffled_input_workspace_data, + const int8* shuffled_weights_data, int batches, int output_depth, + int output_stride, int accum_depth, const int32* bias_data, int32 output_multiplier, int output_shift, int16* output_data) { - const int8* shuffled_weights_ptr = shuffled_weights_data; #if defined USE_NEON - // We'll only need to xor signbit to the input activation values, as - // that xor-ing is pre-built into the shuffled weights values. - const uint8x16_t signbit = vdupq_n_u8(0x80); - const int right_shift = output_shift > 0 ? output_shift : 0; - const int left_shift = output_shift > 0 ? 0 : -output_shift; - for (int c = 0; c < output_depth; c += 4) { - // Accumulation loop. - int32x4_t row_accum0 = vdupq_n_s32(0); - int32x4_t row_accum1 = vdupq_n_s32(0); - int32x4_t row_accum2 = vdupq_n_s32(0); - int32x4_t row_accum3 = vdupq_n_s32(0); - for (int d = 0; d < accum_depth; d += 16) { - int8x16_t weights0 = vld1q_s8(shuffled_weights_ptr + 0); - int8x16_t weights1 = vld1q_s8(shuffled_weights_ptr + 16); - int8x16_t weights2 = vld1q_s8(shuffled_weights_ptr + 32); - int8x16_t weights3 = vld1q_s8(shuffled_weights_ptr + 48); - shuffled_weights_ptr += 64; - int8x16_t input = - vreinterpretq_s8_u8(veorq_u8(signbit, vld1q_u8(input_data + d))); - int16x8_t local_accum0 = - vmull_s8(vget_low_s8(weights0), vget_low_s8(input)); - int16x8_t local_accum1 = - vmull_s8(vget_low_s8(weights1), vget_low_s8(input)); - int16x8_t local_accum2 = - vmull_s8(vget_low_s8(weights2), vget_low_s8(input)); - int16x8_t local_accum3 = - vmull_s8(vget_low_s8(weights3), vget_low_s8(input)); - local_accum0 = - vmlal_s8(local_accum0, vget_high_s8(weights0), vget_high_s8(input)); - local_accum1 = - vmlal_s8(local_accum1, vget_high_s8(weights1), vget_high_s8(input)); - local_accum2 = - vmlal_s8(local_accum2, vget_high_s8(weights2), vget_high_s8(input)); - local_accum3 = - vmlal_s8(local_accum3, vget_high_s8(weights3), vget_high_s8(input)); - row_accum0 = vpadalq_s16(row_accum0, local_accum0); - row_accum1 = vpadalq_s16(row_accum1, local_accum1); - row_accum2 = vpadalq_s16(row_accum2, local_accum2); - row_accum3 = vpadalq_s16(row_accum3, local_accum3); + const int8* shuffled_weights_ptr = shuffled_weights_data; + if (batches == 1) { + const int right_shift = output_shift > 0 ? output_shift : 0; + const int left_shift = output_shift > 0 ? 0 : -output_shift; + for (int c = 0; c < output_depth; c += 4) { + // Accumulation loop. 
+ int32x4_t row_accum0 = vdupq_n_s32(0); + int32x4_t row_accum1 = vdupq_n_s32(0); + int32x4_t row_accum2 = vdupq_n_s32(0); + int32x4_t row_accum3 = vdupq_n_s32(0); + for (int d = 0; d < accum_depth; d += 16) { + int8x16_t weights0 = vld1q_s8(shuffled_weights_ptr + 0); + int8x16_t weights1 = vld1q_s8(shuffled_weights_ptr + 16); + int8x16_t weights2 = vld1q_s8(shuffled_weights_ptr + 32); + int8x16_t weights3 = vld1q_s8(shuffled_weights_ptr + 48); + shuffled_weights_ptr += 64; + int8x16_t input = + vreinterpretq_s8_u8(vld1q_u8(shuffled_input_workspace_data + d)); + int16x8_t local_accum0 = + vmull_s8(vget_low_s8(weights0), vget_low_s8(input)); + int16x8_t local_accum1 = + vmull_s8(vget_low_s8(weights1), vget_low_s8(input)); + int16x8_t local_accum2 = + vmull_s8(vget_low_s8(weights2), vget_low_s8(input)); + int16x8_t local_accum3 = + vmull_s8(vget_low_s8(weights3), vget_low_s8(input)); + local_accum0 = + vmlal_s8(local_accum0, vget_high_s8(weights0), vget_high_s8(input)); + local_accum1 = + vmlal_s8(local_accum1, vget_high_s8(weights1), vget_high_s8(input)); + local_accum2 = + vmlal_s8(local_accum2, vget_high_s8(weights2), vget_high_s8(input)); + local_accum3 = + vmlal_s8(local_accum3, vget_high_s8(weights3), vget_high_s8(input)); + row_accum0 = vpadalq_s16(row_accum0, local_accum0); + row_accum1 = vpadalq_s16(row_accum1, local_accum1); + row_accum2 = vpadalq_s16(row_accum2, local_accum2); + row_accum3 = vpadalq_s16(row_accum3, local_accum3); + } + // Horizontally reduce accumulators + int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1, + pairwise_reduced_acc_2, pairwise_reduced_acc_3; + pairwise_reduced_acc_0 = + vpadd_s32(vget_low_s32(row_accum0), vget_high_s32(row_accum0)); + pairwise_reduced_acc_1 = + vpadd_s32(vget_low_s32(row_accum1), vget_high_s32(row_accum1)); + pairwise_reduced_acc_2 = + vpadd_s32(vget_low_s32(row_accum2), vget_high_s32(row_accum2)); + pairwise_reduced_acc_3 = + vpadd_s32(vget_low_s32(row_accum3), vget_high_s32(row_accum3)); + const int32x2_t reduced_lo = + vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1); + const int32x2_t reduced_hi = + vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3); + int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi); + // Add bias values. + int32x4_t bias_vec = vld1q_s32(bias_data + c); + reduced = vaddq_s32(reduced, bias_vec); + reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift)); + // Multiply by the fixed-point multiplier. + reduced = vqrdmulhq_n_s32(reduced, output_multiplier); + // Rounding-shift-right. + using gemmlowp::RoundingDivideByPOT; + reduced = RoundingDivideByPOT(reduced, right_shift); + // Narrow values down to 16 bit signed. + const int16x4_t res16 = vqmovn_s32(reduced); + vst1_s16(output_data + c, res16); } - // Horizontally reduce accumulators - int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1, - pairwise_reduced_acc_2, pairwise_reduced_acc_3; - pairwise_reduced_acc_0 = - vpadd_s32(vget_low_s32(row_accum0), vget_high_s32(row_accum0)); - pairwise_reduced_acc_1 = - vpadd_s32(vget_low_s32(row_accum1), vget_high_s32(row_accum1)); - pairwise_reduced_acc_2 = - vpadd_s32(vget_low_s32(row_accum2), vget_high_s32(row_accum2)); - pairwise_reduced_acc_3 = - vpadd_s32(vget_low_s32(row_accum3), vget_high_s32(row_accum3)); - const int32x2_t reduced_lo = - vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1); - const int32x2_t reduced_hi = - vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3); - int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi); - // Add bias values. 
- int32x4_t bias_vec = vld1q_s32(bias_data + c); - reduced = vaddq_s32(reduced, bias_vec); - reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift)); - // Multiply by the fixed-point multiplier. - reduced = vqrdmulhq_n_s32(reduced, output_multiplier); - // Rounding-shift-right. - using gemmlowp::RoundingDivideByPOT; - reduced = RoundingDivideByPOT(reduced, right_shift); - // Narrow values down to 16 bit signed. - const int16x4_t res16 = vqmovn_s32(reduced); - vst1_s16(output_data + c, res16); + } else if (batches == 4) { + const int right_shift = output_shift > 0 ? output_shift : 0; + const int left_shift = output_shift > 0 ? 0 : -output_shift; + for (int c = 0; c < output_depth; c += 4) { + const int8* shuffled_input_ptr = + reinterpret_cast(shuffled_input_workspace_data); + // Accumulation loop. + int32x4_t row_accum00 = vdupq_n_s32(0); + int32x4_t row_accum10 = vdupq_n_s32(0); + int32x4_t row_accum20 = vdupq_n_s32(0); + int32x4_t row_accum30 = vdupq_n_s32(0); + int32x4_t row_accum01 = vdupq_n_s32(0); + int32x4_t row_accum11 = vdupq_n_s32(0); + int32x4_t row_accum21 = vdupq_n_s32(0); + int32x4_t row_accum31 = vdupq_n_s32(0); + int32x4_t row_accum02 = vdupq_n_s32(0); + int32x4_t row_accum12 = vdupq_n_s32(0); + int32x4_t row_accum22 = vdupq_n_s32(0); + int32x4_t row_accum32 = vdupq_n_s32(0); + int32x4_t row_accum03 = vdupq_n_s32(0); + int32x4_t row_accum13 = vdupq_n_s32(0); + int32x4_t row_accum23 = vdupq_n_s32(0); + int32x4_t row_accum33 = vdupq_n_s32(0); + for (int d = 0; d < accum_depth; d += 16) { + int8x16_t weights0 = vld1q_s8(shuffled_weights_ptr + 0); + int8x16_t weights1 = vld1q_s8(shuffled_weights_ptr + 16); + int8x16_t weights2 = vld1q_s8(shuffled_weights_ptr + 32); + int8x16_t weights3 = vld1q_s8(shuffled_weights_ptr + 48); + shuffled_weights_ptr += 64; + int8x16_t input0 = vld1q_s8(shuffled_input_ptr + 0); + int8x16_t input1 = vld1q_s8(shuffled_input_ptr + 16); + int8x16_t input2 = vld1q_s8(shuffled_input_ptr + 32); + int8x16_t input3 = vld1q_s8(shuffled_input_ptr + 48); + shuffled_input_ptr += 64; + int16x8_t local_accum0, local_accum1, local_accum2, local_accum3; +#define TFLITE_SHUFFLED_FC_ACCUM(B) \ + local_accum0 = vmull_s8(vget_low_s8(weights0), vget_low_s8(input##B)); \ + local_accum1 = vmull_s8(vget_low_s8(weights1), vget_low_s8(input##B)); \ + local_accum2 = vmull_s8(vget_low_s8(weights2), vget_low_s8(input##B)); \ + local_accum3 = vmull_s8(vget_low_s8(weights3), vget_low_s8(input##B)); \ + local_accum0 = \ + vmlal_s8(local_accum0, vget_high_s8(weights0), vget_high_s8(input##B)); \ + local_accum1 = \ + vmlal_s8(local_accum1, vget_high_s8(weights1), vget_high_s8(input##B)); \ + local_accum2 = \ + vmlal_s8(local_accum2, vget_high_s8(weights2), vget_high_s8(input##B)); \ + local_accum3 = \ + vmlal_s8(local_accum3, vget_high_s8(weights3), vget_high_s8(input##B)); \ + row_accum0##B = vpadalq_s16(row_accum0##B, local_accum0); \ + row_accum1##B = vpadalq_s16(row_accum1##B, local_accum1); \ + row_accum2##B = vpadalq_s16(row_accum2##B, local_accum2); \ + row_accum3##B = vpadalq_s16(row_accum3##B, local_accum3); + + TFLITE_SHUFFLED_FC_ACCUM(0) + TFLITE_SHUFFLED_FC_ACCUM(1) + TFLITE_SHUFFLED_FC_ACCUM(2) + TFLITE_SHUFFLED_FC_ACCUM(3) + +#undef TFLITE_SHUFFLED_FC_ACCUM + } + // Horizontally reduce accumulators + +#define TFLITE_SHUFFLED_FC_STORE(B) \ + { \ + int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1, \ + pairwise_reduced_acc_2, pairwise_reduced_acc_3; \ + pairwise_reduced_acc_0 = \ + vpadd_s32(vget_low_s32(row_accum0##B), vget_high_s32(row_accum0##B)); \ + 
pairwise_reduced_acc_1 = \ + vpadd_s32(vget_low_s32(row_accum1##B), vget_high_s32(row_accum1##B)); \ + pairwise_reduced_acc_2 = \ + vpadd_s32(vget_low_s32(row_accum2##B), vget_high_s32(row_accum2##B)); \ + pairwise_reduced_acc_3 = \ + vpadd_s32(vget_low_s32(row_accum3##B), vget_high_s32(row_accum3##B)); \ + const int32x2_t reduced_lo = \ + vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1); \ + const int32x2_t reduced_hi = \ + vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3); \ + int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi); \ + int32x4_t bias_vec = vld1q_s32(bias_data + c); \ + reduced = vaddq_s32(reduced, bias_vec); \ + reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift)); \ + reduced = vqrdmulhq_n_s32(reduced, output_multiplier); \ + using gemmlowp::RoundingDivideByPOT; \ + reduced = RoundingDivideByPOT(reduced, right_shift); \ + const int16x4_t res16 = vqmovn_s32(reduced); \ + vst1_s16(output_data + c + B * output_stride, res16); \ + } + + TFLITE_SHUFFLED_FC_STORE(0); + TFLITE_SHUFFLED_FC_STORE(1); + TFLITE_SHUFFLED_FC_STORE(2); + TFLITE_SHUFFLED_FC_STORE(3); + +#undef TFLITE_SHUFFLED_FC_STORE + } + } else { + TFLITE_DCHECK(false); + return; } #else - for (int c = 0; c < output_depth; c += 4) { - // Internal accumulation. - // Initialize accumulator with the bias-value. - int32 accum[4] = {0}; - // Accumulation loop. - for (int d = 0; d < accum_depth; d += 16) { + if (batches == 1) { + int16* output_ptr = output_data; + // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) + // so that just reinterpreting them as int8 values is equivalent to + // subtracting 128 from them, thus implementing for free the subtraction of + // the zero_point value 128. + const int8* shuffled_weights_ptr = + reinterpret_cast<const int8*>(shuffled_weights_data); + // Likewise, we preshuffled and pre-xored the input data above. + const int8* shuffled_input_data = + reinterpret_cast<const int8*>(shuffled_input_workspace_data); + for (int c = 0; c < output_depth; c += 4) { + // Internal accumulation. + // Initialize accumulator with the bias-value. + int32 accum[4] = {0}; + // Accumulation loop. + for (int d = 0; d < accum_depth; d += 16) { + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 16; j++) { + int8 input_val = shuffled_input_data[d + j]; + int8 weights_val = *shuffled_weights_ptr++; + accum[i] += weights_val * input_val; + } + } + } for (int i = 0; i < 4; i++) { - for (int j = 0; j < 16; j++) { - int8 input_val = input_data[d + j] - 128; - int8 weights_val = *shuffled_weights_ptr++; - accum[i] += weights_val * input_val; + // Add bias value + int acc = accum[i] + bias_data[c + i]; + // Down-scale the final int32 accumulator to the scale used by our + // (16-bit, typically 3 integer bits) fixed-point format. The quantized + // multiplier and shift here have been pre-computed offline + // (e.g. by toco). + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, + -output_shift); + // Saturate, cast to int16, and store to output array. + acc = std::max(acc, -32768); + acc = std::min(acc, 32767); + output_ptr[c + i] = acc; + } + } + } else if (batches == 4) { + int16* output_ptr = output_data; + // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) + // so that just reinterpreting them as int8 values is equivalent to + // subtracting 128 from them, thus implementing for free the subtraction of + // the zero_point value 128.
+ const int8* shuffled_weights_ptr = + reinterpret_cast<const int8*>(shuffled_weights_data); + // Likewise, we preshuffled and pre-xored the input data above. + const int8* shuffled_input_data = + reinterpret_cast<const int8*>(shuffled_input_workspace_data); + for (int c = 0; c < output_depth; c += 4) { + const int8* shuffled_input_ptr = shuffled_input_data; + // Accumulation loop. + // Internal accumulation. + // Initialize accumulator with the bias-value. + int32 accum[4][4]; + for (int i = 0; i < 4; i++) { + for (int b = 0; b < 4; b++) { + accum[i][b] = 0; + } + } + for (int d = 0; d < accum_depth; d += 16) { + for (int i = 0; i < 4; i++) { + for (int b = 0; b < 4; b++) { + for (int j = 0; j < 16; j++) { + int8 input_val = shuffled_input_ptr[16 * b + j]; + int8 weights_val = shuffled_weights_ptr[16 * i + j]; + accum[i][b] += weights_val * input_val; + } + } + } + shuffled_input_ptr += 64; + shuffled_weights_ptr += 64; + } + for (int i = 0; i < 4; i++) { + for (int b = 0; b < 4; b++) { + // Add bias value + int acc = accum[i][b] + bias_data[c + i]; + // Down-scale the final int32 accumulator to the scale used by our + // (16-bit, typically 3 integer bits) fixed-point format. The + // quantized multiplier and shift here have been pre-computed offline + // (e.g. by toco). + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, + -output_shift); + // Saturate, cast to int16, and store to output array. + acc = std::max(acc, -32768); + acc = std::min(acc, 32767); + output_ptr[b * output_stride + c + i] = acc; } } } - for (int i = 0; i < 4; i++) { - // Add bias value - int acc = accum[i] + bias_data[c + i]; - // Down-scale the final int32 accumulator to the scale used by our - // (16-bit, typically 3 integer bits) fixed-point format. The quantized - // multiplier and shift here have been pre-computed offline - // (e.g. by toco). - acc = - MultiplyByQuantizedMultiplier(acc, output_multiplier, -output_shift); - // Saturate, cast to int16, and store to output array. - acc = std::max(acc, -32768); - acc = std::min(acc, 32767); - output_data[c + i] = acc; - } + } else { + TFLITE_DCHECK(false); + return; } #endif } @@ -1320,12 +1486,15 @@ inline void ExperimentalShuffledFullyConnectedWorkerImpl( // to allow using gemmlowp's threadpool.
struct ExperimentalShuffledFullyConnectedWorkerTask : gemmlowp::Task { ExperimentalShuffledFullyConnectedWorkerTask( - const uint8* input_data, const int8* shuffled_weights_data, - int output_depth, int accum_depth, const int32* bias_data, - int32 output_multiplier, int output_shift, int16* output_data) + const uint8* input_data, const int8* shuffled_weights_data, int batches, + int output_depth, int output_stride, int accum_depth, + const int32* bias_data, int32 output_multiplier, int output_shift, + int16* output_data) : input_data_(input_data), shuffled_weights_data_(shuffled_weights_data), + batches_(batches), output_depth_(output_depth), + output_stride_(output_stride), accum_depth_(accum_depth), bias_data_(bias_data), output_multiplier_(output_multiplier), @@ -1334,13 +1503,16 @@ struct ExperimentalShuffledFullyConnectedWorkerTask : gemmlowp::Task { void Run() override { ExperimentalShuffledFullyConnectedWorkerImpl( - input_data_, shuffled_weights_data_, output_depth_, accum_depth_, - bias_data_, output_multiplier_, output_shift_, output_data_); + input_data_, shuffled_weights_data_, batches_, output_depth_, + output_stride_, accum_depth_, bias_data_, output_multiplier_, + output_shift_, output_data_); } const uint8* input_data_; const int8* shuffled_weights_data_; + int batches_; int output_depth_; + int output_stride_; int accum_depth_; const int32* bias_data_; int32 output_multiplier_; @@ -1354,7 +1526,7 @@ inline void ExperimentalShuffledFullyConnected( const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier, int output_shift, int32 output_activation_min, int32 output_activation_max, int16* output_data, const Dims<4>& output_dims, - gemmlowp::GemmContext* gemm_context) { + uint8* shuffled_input_workspace_data, gemmlowp::GemmContext* gemm_context) { gemmlowp::ScopedProfilingLabel label( "ExperimentalShuffledFullyConnected/8bit"); (void)gemm_context; // only used in optimized code. @@ -1371,10 +1543,8 @@ inline void ExperimentalShuffledFullyConnected( const int accum_depth = ArraySize(weights_dims, 0); TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims)); - // The experimental shuffling is an optimization for matrix*vector product. - // We aren't interested in supporting non-matrix*vector-product cases, i.e. - // batches>1. - TFLITE_DCHECK_EQ(batches, 1); + TFLITE_DCHECK((accum_depth % 16) == 0); + TFLITE_DCHECK((output_depth % 4) == 0); // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) // so that just reinterpreting them as int8 values is equivalent to // subtracting 128 from them, thus implementing for free the subtraction of // the zero_point value 128. const int8* int8_shuffled_weights_data = reinterpret_cast<const int8*>(shuffled_weights_data); - // Our GEMV kernel has 4 rows. This doesn't matter in practice for GEMV - // shapes, gemmlowp::HowManyThreads only takes that parameter because it - // matters for other kinds of GEMM shapes.
+ // Shuffling and xoring of input activations into the workspace buffer + if (batches == 1) { +#ifdef USE_NEON + const uint8x16_t signbit = vdupq_n_u8(0x80); + for (int i = 0; i < accum_depth; i += 16) { + uint8x16_t val = vld1q_u8(input_data + i); + val = veorq_u8(val, signbit); + vst1q_u8(shuffled_input_workspace_data + i, val); + } +#else + for (int i = 0; i < accum_depth; i++) { + shuffled_input_workspace_data[i] = input_data[i] ^ 0x80; + } +#endif + } else if (batches == 4) { + uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data; + int c = 0; +#ifdef USE_NEON + const uint8x16_t signbit = vdupq_n_u8(0x80); + for (c = 0; c < accum_depth; c += 16) { + const uint8* src_data_ptr = input_data + c; + uint8x16_t val0 = vld1q_u8(src_data_ptr + 0 * accum_depth); + uint8x16_t val1 = vld1q_u8(src_data_ptr + 1 * accum_depth); + uint8x16_t val2 = vld1q_u8(src_data_ptr + 2 * accum_depth); + uint8x16_t val3 = vld1q_u8(src_data_ptr + 3 * accum_depth); + val0 = veorq_u8(val0, signbit); + val1 = veorq_u8(val1, signbit); + val2 = veorq_u8(val2, signbit); + val3 = veorq_u8(val3, signbit); + vst1q_u8(shuffled_input_workspace_ptr + 0, val0); + vst1q_u8(shuffled_input_workspace_ptr + 16, val1); + vst1q_u8(shuffled_input_workspace_ptr + 32, val2); + vst1q_u8(shuffled_input_workspace_ptr + 48, val3); + shuffled_input_workspace_ptr += 64; + } +#else + for (c = 0; c < accum_depth; c += 16) { + for (int b = 0; b < 4; b++) { + const uint8* src_data_ptr = input_data + b * accum_depth + c; + for (int j = 0; j < 16; j++) { + uint8 src_val = *src_data_ptr++; + // Flip the sign bit, so that the kernel will only need to + // reinterpret these uint8 values as int8, getting for free the + // subtraction of the zero_point value 128. + uint8 dst_val = src_val ^ 0x80; + *shuffled_input_workspace_ptr++ = dst_val; + } + } + } +#endif + } else { + TFLITE_DCHECK(false); + return; + } + static constexpr int kKernelRows = 4; const int thread_count = gemmlowp::HowManyThreads( - gemm_context->max_num_threads(), output_depth, 1, accum_depth); + gemm_context->max_num_threads(), output_depth, batches, accum_depth); if (thread_count == 1) { // Single-thread case: do the computation on the current thread, don't // use a threadpool ExperimentalShuffledFullyConnectedWorkerImpl( - input_data, int8_shuffled_weights_data, output_depth, accum_depth, - bias_data, output_multiplier, output_shift, output_data); + shuffled_input_workspace_data, int8_shuffled_weights_data, batches, + output_depth, output_depth, accum_depth, bias_data, output_multiplier, + output_shift, output_data); return; } @@ -1406,8 +1629,9 @@ inline void ExperimentalShuffledFullyConnected( for (int i = 0; i < thread_count; i++) { int row_end = std::min(output_depth, row_start + kRowsPerWorker); tasks[i] = new ExperimentalShuffledFullyConnectedWorkerTask( - input_data, int8_shuffled_weights_data + row_start * accum_depth, - row_end - row_start, accum_depth, bias_data + row_start, + shuffled_input_workspace_data, + int8_shuffled_weights_data + row_start * accum_depth, batches, + row_end - row_start, output_depth, accum_depth, bias_data + row_start, output_multiplier, output_shift, output_data + row_start); row_start = row_end; } diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h index 49a93b0c6de..d1d4f54f86a 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h @@ 
-608,8 +608,9 @@ inline void ExperimentalShuffledFullyConnected( const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier, int output_shift, int32 output_activation_min, int32 output_activation_max, int16* output_data, const Dims<4>& output_dims, - gemmlowp::GemmContext* gemm_context) { + uint8* shuffled_input_workspace_data, gemmlowp::GemmContext* gemm_context) { (void)gemm_context; // only used in optimized code. + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); // TODO(benoitjacob): This really should be: // const int batches = ArraySize(output_dims, 1); @@ -622,44 +623,130 @@ inline void ExperimentalShuffledFullyConnected( const int accum_depth = ArraySize(weights_dims, 0); TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims)); - // The experimental shuffling is an optimization for matrix*vector product. - // We aren't interested in supporting non-matrix*vector-product cases, i.e. - // batches>1. - TFLITE_DCHECK_EQ(batches, 1); - // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) - // so that just reinterpreting them as int8 values is equivalent to - // subtracting 128 from them, thus implementing for free the subtraction of - // the zero_point value 128. - const int8* shuffled_weights_ptr = - reinterpret_cast(shuffled_weights_data); - for (int c = 0; c < output_depth; c += 4) { - // Internal accumulation. - // Initialize accumulator with the bias-value. - int32 accum[4] = {0}; - // Accumulation loop. - for (int d = 0; d < accum_depth; d += 16) { - for (int i = 0; i < 4; i++) { + TFLITE_DCHECK((accum_depth % 16) == 0); + TFLITE_DCHECK((output_depth % 4) == 0); + + // Shuffling and xoring of input activations into the workspace buffer + uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data; + if (batches == 1) { + for (int i = 0; i < accum_depth; i++) { + shuffled_input_workspace_data[i] = input_data[i] ^ 0x80; + } + } else if (batches == 4) { + for (int c = 0; c < accum_depth; c += 16) { + for (int b = 0; b < 4; b++) { + const uint8* src_data_ptr = input_data + b * accum_depth + c; for (int j = 0; j < 16; j++) { - int8 input_val = input_data[d + j] - 128; - int8 weights_val = *shuffled_weights_ptr++; - accum[i] += weights_val * input_val; + uint8 src_val = *src_data_ptr++; + // Flip the sign bit, so that the kernel will only need to + // reinterpret these uint8 values as int8, getting for free the + // subtraction of the zero_point value 128. + uint8 dst_val = src_val ^ 0x80; + *shuffled_input_workspace_ptr++ = dst_val; } } } - for (int i = 0; i < 4; i++) { - // Add bias value - int acc = accum[i] + bias_data[c + i]; - // Down-scale the final int32 accumulator to the scale used by our - // (16-bit, typically 3 integer bits) fixed-point format. The quantized - // multiplier and shift here have been pre-computed offline - // (e.g. by toco). - acc = - MultiplyByQuantizedMultiplier(acc, output_multiplier, -output_shift); - // Saturate, cast to int16, and store to output array. - acc = std::max(acc, output_activation_min); - acc = std::min(acc, output_activation_max); - output_data[c + i] = acc; + } else { + TFLITE_DCHECK(false); + return; + } + + // Actual computation + if (batches == 1) { + int16* output_ptr = output_data; + // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) + // so that just reinterpreting them as int8 values is equivalent to + // subtracting 128 from them, thus implementing for free the subtraction of + // the zero_point value 128. 
+    const int8* shuffled_weights_ptr =
+        reinterpret_cast<const int8*>(shuffled_weights_data);
+    // Likewise, we preshuffled and pre-xored the input data above.
+    const int8* shuffled_input_data =
+        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
+    for (int c = 0; c < output_depth; c += 4) {
+      // Internal accumulation.
+      // Initialize accumulator with the bias-value.
+      int32 accum[4] = {0};
+      // Accumulation loop.
+      for (int d = 0; d < accum_depth; d += 16) {
+        for (int i = 0; i < 4; i++) {
+          for (int j = 0; j < 16; j++) {
+            int8 input_val = shuffled_input_data[d + j];
+            int8 weights_val = *shuffled_weights_ptr++;
+            accum[i] += weights_val * input_val;
+          }
+        }
+      }
+      for (int i = 0; i < 4; i++) {
+        // Add bias value
+        int acc = accum[i] + bias_data[c + i];
+        // Down-scale the final int32 accumulator to the scale used by our
+        // (16-bit, typically 3 integer bits) fixed-point format. The quantized
+        // multiplier and shift here have been pre-computed offline
+        // (e.g. by toco).
+        acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
+                                            -output_shift);
+        // Saturate, cast to int16, and store to output array.
+        acc = std::max(acc, output_activation_min);
+        acc = std::min(acc, output_activation_max);
+        output_ptr[c + i] = acc;
+      }
+    }
+  } else if (batches == 4) {
+    int16* output_ptr = output_data;
+    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
+    // so that just reinterpreting them as int8 values is equivalent to
+    // subtracting 128 from them, thus implementing for free the subtraction of
+    // the zero_point value 128.
+    const int8* shuffled_weights_ptr =
+        reinterpret_cast<const int8*>(shuffled_weights_data);
+    // Likewise, we preshuffled and pre-xored the input data above.
+    const int8* shuffled_input_data =
+        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
+    for (int c = 0; c < output_depth; c += 4) {
+      const int8* shuffled_input_ptr = shuffled_input_data;
+      // Accumulation loop.
+      // Internal accumulation.
+      // Initialize accumulator with the bias-value.
+      int32 accum[4][4];
+      for (int i = 0; i < 4; i++) {
+        for (int b = 0; b < 4; b++) {
+          accum[i][b] = 0;
+        }
+      }
+      for (int d = 0; d < accum_depth; d += 16) {
+        for (int i = 0; i < 4; i++) {
+          for (int b = 0; b < 4; b++) {
+            for (int j = 0; j < 16; j++) {
+              int8 input_val = shuffled_input_ptr[16 * b + j];
+              int8 weights_val = shuffled_weights_ptr[16 * i + j];
+              accum[i][b] += weights_val * input_val;
+            }
+          }
+        }
+        shuffled_input_ptr += 64;
+        shuffled_weights_ptr += 64;
+      }
+      for (int i = 0; i < 4; i++) {
+        for (int b = 0; b < 4; b++) {
+          // Add bias value
+          int acc = accum[i][b] + bias_data[c + i];
+          // Down-scale the final int32 accumulator to the scale used by our
+          // (16-bit, typically 3 integer bits) fixed-point format. The
+          // quantized multiplier and shift here have been pre-computed offline
+          // (e.g. by toco).
+          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
+                                              -output_shift);
+          // Saturate, cast to int16, and store to output array.
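+          // (Aside: for this int16 output the clamp bounds are at most
+          //  [-32768, 32767]; a fused activation such as ReLU would tighten
+          //  them further.)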
+ acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + output_ptr[b * output_depth + c + i] = acc; + } + } + } + } else { + TFLITE_DCHECK(false); + return; } } diff --git a/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc b/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc index f098981a5cf..c00cdcb944b 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc @@ -55,17 +55,26 @@ bool ExperimentalShuffleFCWeights::Run(Model* model, std::size_t op_index) { // Exit if, based on the known shapes, this FC op is not a GEMV. // The shuffling of FC weights is only useful to enable fast GEMV paths. const Shape& input_shape = input_array.shape(); - for (int i = 0; i < input_shape.dimensions_count() - 1; i++) { + for (int i = 1; i < input_shape.dimensions_count() - 1; i++) { if (input_shape.dims(i) != 1) { // The input activations, shaped as a matrix, have multiple columns. // This FC op isn't a matrix*vector multiplication. AddMessageF( "Not applying experimental shuffling to the weights of %s because " - "it's not a matrix*vector product", + "the input shape is not 1D or 2D (possibly with additional inner " + "dimensions of size 1)", LogName(*op)); return false; } } + if (input_shape.dims(0) != 1 && input_shape.dims(0) != 4) { + AddMessageF( + "Not applying experimental shuffling to the weights of %s because " + "the input shape's leading dimension, i.e. the 'batch size', is not " + "equal to 1 or 4", + LogName(*op)); + return false; + } // Exit if the weights shape isn't an integral multiple of the shuffled // block shape, 4x16. We don't want to have to write code dealing with // odd sizes, that would go un-exercised at the moment as the models @@ -129,6 +138,20 @@ bool ExperimentalShuffleFCWeights::Run(Model* model, std::size_t op_index) { fc_op->experimental_shuffled_weights = true; AddMessageF("Applied experimental shuffling to the weights of %s", LogName(*op)); + // Add a second output array to this FC op, serving as a workspace to perform + // runtime shuffling/xoring of its input activations. 
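+  // (The workspace array mirrors the input's data type, shape, min/max and
+  //  quantization params, copied just below, so the runtime can allocate and
+  //  interpret it exactly like the input activations.)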
+  CHECK_EQ(fc_op->outputs.size(), 1);
+  const string& shuffled_input_workspace_array_name =
+      AvailableArrayName(*model, fc_op->inputs[0] + "_shuffled");
+  fc_op->outputs.push_back(shuffled_input_workspace_array_name);
+  auto& shuffled_input_workspace_array =
+      model->GetOrCreateArray(shuffled_input_workspace_array_name);
+  shuffled_input_workspace_array.data_type = input_array.data_type;
+  *shuffled_input_workspace_array.mutable_shape() = input_array.shape();
+  shuffled_input_workspace_array.GetOrCreateMinMax() = input_array.GetMinMax();
+  shuffled_input_workspace_array.GetOrCreateQuantizationParams() =
+      input_array.GetQuantizationParams();
+
   return true;
 }
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index cf2cbeedc77..5a341294db5 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -1405,20 +1405,7 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
       }
       input_minmax.min = (qmin - mean_value) / std_value;
       input_minmax.max = (qmax - mean_value) / std_value;
-      if (input_array.minmax) {
-        if (input_array_proto.has_mean_value() ||
-            input_array_proto.has_std_value()) {
-          const double width = input_minmax.max - input_minmax.min;
-          const double kMinMaxAllowedDiff = 1e-6 * width;
-          CHECK(std::abs(input_minmax.min - input_array.minmax->min) <
-                    kMinMaxAllowedDiff &&
-                std::abs(input_minmax.max - input_array.minmax->max) <
-                    kMinMaxAllowedDiff)
-              << input_minmax.min << ", " << input_minmax.max
-              << " != " << input_array.minmax->min << ", "
-              << input_array.minmax->max;
-        }
-      } else {
+      if (!input_array.minmax) {
         input_array.GetOrCreateMinMax() = input_minmax;
       }
     }

From 89ff74a7b25c01a511e84a805d3b2edf780142a6 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Mon, 23 Apr 2018 12:03:19 -0700
Subject: [PATCH 0608/1734] [XLA] Disallow conversion from StatusOr<T> to
 StatusOr<U> if T is not convertible to U.

PiperOrigin-RevId: 193962287
---
 tensorflow/compiler/xla/statusor.h | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/tensorflow/compiler/xla/statusor.h b/tensorflow/compiler/xla/statusor.h
index 641b5e9a6ac..cccbce5fc83 100644
--- a/tensorflow/compiler/xla/statusor.h
+++ b/tensorflow/compiler/xla/statusor.h
@@ -113,17 +113,19 @@ class StatusOr : private internal_statusor::StatusOrData<T>,
   StatusOr& operator=(StatusOr&&) = default;
 
   // Conversion copy/move constructor, T must be convertible from U.
-  // TODO(b/62186717): These should not participate in overload resolution if U
-  // is not convertible to T.
-  template <typename U>
+  template <typename U, typename std::enable_if<
+                            std::is_convertible<U, T>::value>::type* = nullptr>
   StatusOr(const StatusOr<U>& other);
-  template <typename U>
+  template <typename U, typename std::enable_if<
+                            std::is_convertible<U, T>::value>::type* = nullptr>
   StatusOr(StatusOr<U>&& other);
 
   // Conversion copy/move assignment operator, T must be convertible from U.
-  template <typename U>
+  template <typename U, typename std::enable_if<
+                            std::is_convertible<U, T>::value>::type* = nullptr>
   StatusOr& operator=(const StatusOr<U>& other);
-  template <typename U>
+  template <typename U, typename std::enable_if<
+                            std::is_convertible<U, T>::value>::type* = nullptr>
   StatusOr& operator=(StatusOr<U>&& other);
 
   // Constructs a new StatusOr with the given value. After calling this
@@ -233,12 +235,14 @@ StatusOr<T>& StatusOr<T>::operator=(Status&& status) {
 }
 
 template <typename T>
-template <typename U>
+template <typename U, typename std::enable_if<
+              std::is_convertible<U, T>::value>::type*>
 inline StatusOr<T>::StatusOr(const StatusOr<U>& other)
     : Base(static_cast<const typename StatusOr<U>::Base&>(other)) {}
 
 template <typename T>
-template <typename U>
+template <typename U, typename std::enable_if<
+              std::is_convertible<U, T>::value>::type*>
 inline StatusOr<T>& StatusOr<T>::operator=(const StatusOr<U>& other) {
   if (other.ok())
     this->Assign(other.ValueOrDie());
@@ -248,12 +252,14 @@ inline StatusOr<T>& StatusOr<T>::operator=(const StatusOr<U>& other) {
 }
 
 template <typename T>
-template <typename U>
+template <typename U, typename std::enable_if<
+              std::is_convertible<U, T>::value>::type*>
 inline StatusOr<T>::StatusOr(StatusOr<U>&& other)
    : Base(static_cast<typename StatusOr<U>::Base&&>(other)) {}
 
 template <typename T>
-template <typename U>
+template <typename U, typename std::enable_if<
+              std::is_convertible<U, T>::value>::type*>
 inline StatusOr<T>& StatusOr<T>::operator=(StatusOr<U>&& other) {
   if (other.ok()) {
     this->Assign(std::move(other).ValueOrDie());

From 4adc560844c4d769efdaeb5b67d5ace1e0df7b16 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 23 Apr 2018 12:21:29 -0700
Subject: [PATCH 0609/1734] Rewrite tail recursion in loop optimizer as loop to
 avoid stack overflow.

PiperOrigin-RevId: 193965038
---
 .../grappler/optimizers/loop_optimizer.cc     | 70 +++++++++++--------
 1 file changed, 39 insertions(+), 31 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.cc b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
index fff06dd2ace..f7994221bb3 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
@@ -320,42 +320,50 @@ Status LoopInvariantNodeMotionOptimizer::RevertInvariantNodes() {
   return Status::OK();
 }
 
-Status LoopInvariantNodeMotionOptimizer::FindInvariantNodes(NodeDef* node) {
-  auto consumers = node_map_->GetOutputs(node->name());
-  invariant_nodes_.insert(std::make_pair(node, consumers.size()));
-  for (auto* consumer : consumers) {
-    if (invariant_nodes_.count(consumer) || ModifiesFrameInfo(*consumer)) {
-      continue;
-    }
-    bool is_invariant = true;
-    for (const auto& input : consumer->input()) {
-      if (!IsControlInput(input)) {
-        const string name = NodeName(input);
-        auto* producer = node_map_->GetNode(name);
-        if (!invariant_nodes_.count(producer)) {
-          if (IsConstant(*producer)) {
-            invariant_nodes_.insert(
-                std::make_pair(producer, node_map_->GetOutputs(name).size()));
-          } else {
-            is_invariant = false;
-            break;
+Status LoopInvariantNodeMotionOptimizer::FindInvariantNodes(
+    NodeDef* start_node) {
+  std::vector<NodeDef*> stack;
+  stack.reserve(32);
+  stack.push_back(start_node);
+  while (!stack.empty()) {
+    NodeDef* node = stack.back();
+    stack.pop_back();
+    auto consumers = node_map_->GetOutputs(node->name());
+    invariant_nodes_.emplace(node, consumers.size());
+    for (auto* consumer : consumers) {
+      if (invariant_nodes_.count(consumer) || ModifiesFrameInfo(*consumer)) {
+        continue;
+      }
+      bool is_invariant = true;
+      for (const auto& input : consumer->input()) {
+        if (!IsControlInput(input)) {
+          const string name = NodeName(input);
+          auto* producer = node_map_->GetNode(name);
+          if (!invariant_nodes_.count(producer)) {
+            if (IsConstant(*producer)) {
+              invariant_nodes_.insert(
+                  std::make_pair(producer, node_map_->GetOutputs(name).size()));
+            } else {
+              is_invariant = false;
+              break;
+            }
           }
         }
       }
-    }
-    if (is_invariant) {
-      std::set<NodeDef*> producers;
-      for (const auto& input : consumer->input()) {
-        auto* producer = node_map_->GetNode(input);
-        producers.insert(producer);
-      }
-      for (auto* producer : producers) {
-        auto iter = invariant_nodes_.find(producer);
-        if (iter != invariant_nodes_.end()) {
-          --iter->second;
+      if (is_invariant) {
+        std::set<NodeDef*> producers;
+        for (const auto& input :
consumer->input()) { + auto* producer = node_map_->GetNode(input); + producers.insert(producer); } + for (auto* producer : producers) { + auto iter = invariant_nodes_.find(producer); + if (iter != invariant_nodes_.end()) { + --iter->second; + } + } + stack.push_back(consumer); } - TF_RETURN_IF_ERROR(FindInvariantNodes(consumer)); } } return Status::OK(); From 7de04c4cd9fb6a38b1b34d02fed14c89057bf002 Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Mon, 23 Apr 2018 12:21:57 -0700 Subject: [PATCH 0610/1734] Add TensorFlow format support to tf.keras.Model.save_weights and load_weights Supports restore-on-create in subclassed Models when executing eagerly, and removes the requirement that the Model be built before weights are loaded. Currently only subclassed Models work with the TensorFlow weight format. Graph networks will need a bit of extra logic to support the same topology/by-name distinction as the current HDF5 format (and for now they don't even add Checkpointable dependencies on their sub-layers). Some notes: - Checkpoints won't be numbered. This keeps behavior the same as for existing HDF5 weight saving. - All dependencies will be saved for subclassed Models, not just layers. This will make it more useful for training checkpoints (you can assign an optimizer to an attribute and save the slot variables that way). - Subclassed models won't support loading by flattened weight list from the TensorFlow format. Since there's no global naming for Layers (it's local to the Model), I think this is reasonable. PiperOrigin-RevId: 193965120 --- .../keras/_impl/keras/engine/base_layer.py | 9 + .../keras/_impl/keras/engine/network.py | 204 +++++++++++++--- .../keras/_impl/keras/engine/saving_test.py | 227 +++++++++++++++++- .../keras/_impl/keras/engine/training.py | 3 + .../_impl/keras/model_subclassing_test.py | 29 ++- .../python/training/checkpointable_utils.py | 12 +- .../api/golden/tensorflow.keras.-model.pbtxt | 2 +- .../golden/tensorflow.keras.-sequential.pbtxt | 2 +- .../tensorflow.keras.models.-model.pbtxt | 2 +- .../tensorflow.keras.models.-sequential.pbtxt | 2 +- tensorflow/tools/ci_build/ci_sanity.sh | 1 + 11 files changed, 436 insertions(+), 57 deletions(-) diff --git a/tensorflow/python/keras/_impl/keras/engine/base_layer.py b/tensorflow/python/keras/_impl/keras/engine/base_layer.py index 6c68d251275..abae6c3785b 100644 --- a/tensorflow/python/keras/_impl/keras/engine/base_layer.py +++ b/tensorflow/python/keras/_impl/keras/engine/base_layer.py @@ -726,8 +726,17 @@ class Layer(checkpointable.CheckpointableBase): if hasattr(self, '_initial_weights') and self._initial_weights is not None: self.set_weights(self._initial_weights) del self._initial_weights + self._post_build_cleanup() return outputs + def _post_build_cleanup(self): + """Hooks to run after all sub-Layers are built.""" + # Note that in addition to Layer.__call__, this method is called by Model + # after building a graph network (which skips __call__). It should be called + # when possible if self.built may have switched from False to True, and is + # idempotent. + pass # No-op for Layers which don't override this method. + def apply(self, inputs, *args, **kwargs): """Apply the layer on a input. 
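The `_post_build_cleanup` hook added to `Layer` above is easiest to see in
isolation. A minimal sketch of the pattern (hypothetical standalone class, not
the Keras code itself): a zero-argument action queued before the object is
built runs exactly once, as soon as building completes.

    class Buildable(object):
      """Hypothetical illustration of the deferred-cleanup hook pattern."""

      def __init__(self):
        self._deferred = None  # e.g. a queued restore callback

      def queue(self, fn):
        self._deferred = fn

      def call(self, x):
        return x  # stand-in for real building/computation

      def __call__(self, x):
        out = self.call(x)  # building happens here (variables get created)
        self._post_build_cleanup()
        return out

      def _post_build_cleanup(self):
        # Idempotent: run queued work exactly once, then clear it.
        if self._deferred is not None:
          self._deferred()
          self._deferred = None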
diff --git a/tensorflow/python/keras/_impl/keras/engine/network.py b/tensorflow/python/keras/_impl/keras/engine/network.py index 3b419dff3a1..4127c781eb4 100644 --- a/tensorflow/python/keras/_impl/keras/engine/network.py +++ b/tensorflow/python/keras/_impl/keras/engine/network.py @@ -22,11 +22,14 @@ from __future__ import print_function import copy import json import os +import weakref import numpy as np from six.moves import zip # pylint: disable=redefined-builtin +from tensorflow.python import pywrap_tensorflow from tensorflow.python.eager import context +from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.keras._impl.keras import backend as K @@ -37,6 +40,7 @@ from tensorflow.python.keras._impl.keras.utils.io_utils import ask_to_proceed_wi from tensorflow.python.keras._impl.keras.utils.layer_utils import print_summary as print_layer_summary from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import checkpointable +from tensorflow.python.training import checkpointable_utils from tensorflow.python.util import nest from tensorflow.python.util import tf_inspect @@ -114,6 +118,13 @@ class Network(base_layer.Layer): self._outbound_nodes = [] self._inbound_nodes = [] + self._checkpointable_saver = checkpointable_utils.CheckpointableSaver( + weakref.ref(self)) + # A zero-argument function which should be called and set back to None as + # soon as the network is built (only applicable to subclassed Models). Runs + # restore operations when graph building. + self._in_progress_restore_finalizer = None + def _init_graph_network(self, inputs, outputs, name=None): self._uses_inputs_arg = True # Normalize and set self.inputs, self.outputs. @@ -1125,62 +1136,179 @@ class Network(base_layer.Layer): from tensorflow.python.keras._impl.keras.models import save_model # pylint: disable=g-import-not-at-top save_model(self, filepath, overwrite, include_optimizer) - def save_weights(self, filepath, overwrite=True): - """Dumps all layer weights to a HDF5 file. + def save_weights(self, filepath, overwrite=True, save_format=None): + """Saves all layer weights. - The weight file has: - - `layer_names` (attribute), a list of strings - (ordered names of model layers). - - For every layer, a `group` named `layer.name` - - For every such layer group, a group attribute `weight_names`, - a list of strings - (ordered names of weights tensor of the layer). - - For every weight in the layer, a dataset - storing the weight value, named after the weight tensor. + Either saves in HDF5 or in TensorFlow format based on the `save_format` + argument. + + When saving in HDF5 format, the weight file has: + - `layer_names` (attribute), a list of strings + (ordered names of model layers). + - For every layer, a `group` named `layer.name` + - For every such layer group, a group attribute `weight_names`, + a list of strings + (ordered names of weights tensor of the layer). + - For every weight in the layer, a dataset + storing the weight value, named after the weight tensor. + + Currently the TensorFlow format is only supported for user-defined classes + inheriting from `tf.keras.Model`, and not for networks constructed from + inputs and outputs (using `tf.keras.Model(inputs, outputs)`). 
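+
+    For example (illustrative; `MySubclassedModel` is a stand-in for any
+    user-defined `tf.keras.Model` subclass):
+
+    ```python
+    model = MySubclassedModel()
+    model(x)  # build the model's variables
+    model.save_weights('/tmp/model', save_format='tf')  # TensorFlow format
+    model.save_weights('/tmp/weights.h5')  # '.h5' suffix selects HDF5
+    ```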
+ + When saving in TensorFlow format, all objects referenced by the network are + saved in the same format as `tf.train.Checkpoint`, including any `Layer`s or + `Optimizer`s assigned to attributes in the constructor. See + `tf.train.Checkpoint`'s documentation for details. Arguments: - filepath: String, path to the file to save the weights to. + filepath: String, path to the file to save the weights to. When saving + in TensorFlow format, this is the prefix used for checkpoint files + (multiple files are generated). Note that the '.h5' suffix causes + weights to be saved in HDF5 format. overwrite: Whether to silently overwrite any existing file at the target location, or provide the user with a manual prompt. + save_format: Either 'tf' or 'h5'. If `None`, defaults to 'tf' for + user-defined classes inheriting from `tf.keras.Model` and 'h5' for + networks constructed from inputs and outputs. `filepath`s ending in + '.h5' or '.keras' always default to HDF5. Currently only 'h5' is + supported for networks constructed from inputs and outputs. Once + supported, the default for all networks will switch to 'tf'. Raises: - ImportError: If h5py is not available. + ImportError: If h5py is not available when attempting to save in HDF5 + format. + ValueError: For invalid/unknown format arguments. """ - if h5py is None: - raise ImportError('`save_weights` requires h5py.') + filepath_is_h5 = filepath.endswith('.h5') or filepath.endswith('.keras') + if save_format is None: + if filepath_is_h5: + save_format = 'h5' + else: + if self._is_graph_network: + # TODO(allenl): Handle loading by weight index and fix dependencies, + # then enable 'tensorflow' format by default for graph networks. + save_format = 'h5' + else: + # Subclassed models save in TensorFlow format by default. + save_format = 'tf' + else: + user_format = save_format.lower().strip() + if user_format in ('tensorflow', 'tf'): + save_format = 'tf' + elif user_format in ('hdf5', 'h5', 'keras'): + save_format = 'h5' + else: + raise ValueError( + 'Unknown format "%s". Was expecting one of {"tf", "h5"}.' % ( + save_format,)) + if save_format == 'tf' and filepath_is_h5: + raise ValueError( + ('save_weights got save_format="tf"/"tensorflow", but the ' + 'filepath ("%s") looks like an HDF5 file. Omit the ".h5"/".keras" ' + 'when saving in TensorFlow format.') + % filepath) + + if save_format == 'h5' and h5py is None: + raise ImportError( + '`save_weights` requires h5py when saving in hdf5.') + if save_format == 'tf': + if self._is_graph_network: + raise NotImplementedError( + 'Networks constructed from inputs and outputs do not yet support ' + 'saving weights in the TensorFlow ("tf") save_format.') + check_filepath = filepath + '.index' + else: + check_filepath = filepath # If file exists and should not be overwritten: - if not overwrite and os.path.isfile(filepath): - proceed = ask_to_proceed_with_overwrite(filepath) + if not overwrite and os.path.isfile(check_filepath): + proceed = ask_to_proceed_with_overwrite(check_filepath) if not proceed: return - with h5py.File(filepath, 'w') as f: - saving.save_weights_to_hdf5_group(f, self.layers) + if save_format == 'h5': + with h5py.File(filepath, 'w') as f: + saving.save_weights_to_hdf5_group(f, self.layers) + else: + self._checkpointable_saver.save(filepath) def load_weights(self, filepath, by_name=False): - """Loads all layer weights from a HDF5 save file. + """Loads all layer weights, either from a TensorFlow or an HDF5 weight file. 
- If `by_name` is False (default) weights are loaded - based on the network's topology, meaning the architecture - should be the same as when the weights were saved. - Note that layers that don't have weights are not taken - into account in the topological ordering, so adding or - removing layers is fine as long as they don't have weights. + If `by_name` is False weights are loaded based on the network's + topology. This means the architecture should be the same as when the weights + were saved. Note that layers that don't have weights are not taken into + account in the topological ordering, so adding or removing layers is fine as + long as they don't have weights. - If `by_name` is True, weights are loaded into layers - only if they share the same name. This is useful - for fine-tuning or transfer-learning models where + If `by_name` is True, weights are loaded into layers only if they share the + same name. This is useful for fine-tuning or transfer-learning models where some of the layers have changed. + Only topological loading (`by_name=False`) is supported when loading weights + from the TensorFlow format. Note that topological loading differs slightly + between TensorFlow and HDF5 formats for user-defined classes inheriting from + `tf.keras.Model`: HDF5 loads based on a flattened list of weights, while the + TensorFlow format loads based on the object-local names of attributes to + which layers are assigned in the `Model`'s constructor. + Arguments: - filepath: String, path to the weights file to load. - by_name: Boolean, whether to load weights by name - or by topological order. + filepath: String, path to the weights file to load. For weight files in + TensorFlow format, this is the file prefix (the same as was passed + to `save_weights`). + by_name: Boolean, whether to load weights by name or by topological + order. Only topological loading is supported for weight files in + TensorFlow format. + + Returns: + When loading a weight file in TensorFlow format, returns the same status + object as `tf.train.Checkpoint.restore`. When graph building, restore + ops are run automatically as soon as the network is built (on first call + for user-defined classes inheriting from `Model`, immediately if it is + already built). + + When loading weights in HDF5 format, returns `None`. Raises: - ImportError: If h5py is not available. + ImportError: If h5py is not available and the weight file is in HDF5 + format. """ + if self._is_graph_network: + # Graph networks do not currently support TensorFlow formatted weight + # files. + save_format = 'h5' + else: + save_format = None + if save_format is None: + try: + pywrap_tensorflow.NewCheckpointReader(filepath) + save_format = 'tf' + except errors_impl.DataLossError: + # The checkpoint is not readable in TensorFlow format. Try HDF5. + save_format = 'h5' + if save_format == 'tf': + status = self._checkpointable_saver.restore(filepath) + if by_name: + raise NotImplementedError( + 'Weights may only be loaded based on topology into Models when ' + 'loading TensorFlow-formatted weights (got by_name=True to ' + 'load_weights).') + if not context.executing_eagerly(): + finalizer = status.run_restore_ops + if self.built: + finalizer() + else: + # Hold on to this status object until the network is built (for + # subclassed Models). Then we'll run restore ops if necessary. 
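+          # (E.g. `status = model.load_weights(prefix)` on a not-yet-built
+          #  subclassed Model: the ops queued here run from
+          #  _post_build_cleanup on the model's first call.)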
+ self._in_progress_restore_finalizer = finalizer + return status if h5py is None: - raise ImportError('`load_weights` requires h5py.') + raise ImportError( + '`load_weights` requires h5py when loading weights from HDF5.') + if self._is_graph_network and not self.built: + raise NotImplementedError( + 'Unable to load weights saved in HDF5 format into a subclassed ' + 'Model which has not created its variables yet. Call the Model ' + 'first, then load the weights.') with h5py.File(filepath, 'r') as f: if 'layer_names' not in f.attrs and 'model_weights' in f: f = f['model_weights'] @@ -1189,6 +1317,14 @@ class Network(base_layer.Layer): else: saving.load_weights_from_hdf5_group(f, self.layers) + def _post_build_cleanup(self): + super(Network, self)._post_build_cleanup() + if self._in_progress_restore_finalizer is not None: + # Runs queued restore operations left over from load_weights when graph + # building. + self._in_progress_restore_finalizer() + self._in_progress_restore_finalizer = None + def _updated_config(self): """Util shared between different serialization methods. diff --git a/tensorflow/python/keras/_impl/keras/engine/saving_test.py b/tensorflow/python/keras/_impl/keras/engine/saving_test.py index 3b1578cddfd..8764ae5e9cf 100644 --- a/tensorflow/python/keras/_impl/keras/engine/saving_test.py +++ b/tensorflow/python/keras/_impl/keras/engine/saving_test.py @@ -24,7 +24,15 @@ import tempfile import numpy as np +from tensorflow.python.eager import context +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util from tensorflow.python.keras._impl import keras +from tensorflow.python.keras._impl.keras.engine import training +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import random_ops from tensorflow.python.platform import test from tensorflow.python.training import training as training_module @@ -55,12 +63,16 @@ class TestWeightSavingAndLoading(test.TestCase): with self.assertRaises(ValueError): model.set_weights(weights[::-1]) - if h5py is None: - return # Skip rest of test if H5py isn't available. - temp_dir = self.get_temp_dir() self.addCleanup(shutil.rmtree, temp_dir) + no_extension_path = os.path.join(temp_dir, 'test') + with self.assertRaises(NotImplementedError): + model.save_weights(no_extension_path, save_format='tensorflow') + + if h5py is None: + return # Skip rest of test if H5py isn't available. + h5_path = os.path.join(temp_dir, 'test.h5') model.save_weights(h5_path) model.load_weights(h5_path) @@ -71,6 +83,16 @@ class TestWeightSavingAndLoading(test.TestCase): y = model.predict(x) self.assertAllClose(ref_y, y) + model.save_weights(no_extension_path) + model.load_weights(no_extension_path) + y = model.predict(x) + self.assertAllClose(ref_y, y) + + model.save_weights(no_extension_path, save_format='hdf5') + model.load_weights(no_extension_path) + y = model.predict(x) + self.assertAllClose(ref_y, y) + def test_weight_preprocessing(self): input_dim = 3 output_dim = 3 @@ -457,5 +479,204 @@ class TestWholeModelSaving(test.TestCase): os.remove(fname) +class SubclassedModel(training.Model): + + def __init__(self): + super(SubclassedModel, self).__init__() + self.x_layer = keras.layers.Dense(3) + self.b_layer = keras.layers.Dense(1) + + def call(self, a): + return self.b_layer(self.x_layer(a)) + + +# TODO(allenl): The graph model tests in this TestCase are still saving in +# hdf5. 
Get them to save in tensorflow format. +class TestWeightSavingAndLoadingTFFormat(test.TestCase): + + @test_util.run_in_graph_and_eager_modes() + def test_tensorflow_format_overwrite(self): + with self.test_session() as session: + model = SubclassedModel() + temp_dir = self.get_temp_dir() + prefix = os.path.join(temp_dir, 'ckpt') + + x = constant_op.constant(np.random.random((3, 2)), dtype=dtypes.float32) + executing_eagerly = context.executing_eagerly() + model(x) # pylint: disable=not-callable + if not executing_eagerly: + session.run([v.initializer for v in model.variables]) + model.save_weights(prefix, save_format='tensorflow') + model.save_weights(prefix, save_format='tensorflow', overwrite=True) + with self.assertRaises(EOFError): + # Indirectly tests that the user is prompted + model.save_weights(prefix, save_format='tensorflow', overwrite=False) + + def test_no_graph_pollution(self): + with context.graph_mode(): + graph = ops.Graph() + with graph.as_default(), self.test_session(graph) as session: + model = SubclassedModel() + temp_dir = self.get_temp_dir() + prefix = os.path.join(temp_dir, 'ckpt') + + x = constant_op.constant(np.random.random((3, 2)), dtype=dtypes.float32) + model(x) # pylint: disable=not-callable + session.run([v.initializer for v in model.variables]) + model.save_weights(prefix, save_format='tensorflow') + op_count = len(graph.get_operations()) + model.save_weights(prefix, save_format='tensorflow') + self.assertEqual(len(graph.get_operations()), op_count) + + model.load_weights(prefix) + op_count = len(graph.get_operations()) + model.load_weights(prefix) + self.assertEqual(len(graph.get_operations()), op_count) + + def _weight_loading_test_template(self, make_model_fn): + with self.test_session() as session: + model = make_model_fn() + temp_dir = self.get_temp_dir() + prefix = os.path.join(temp_dir, 'ckpt') + + x = constant_op.constant(np.random.random((3, 2)), dtype=dtypes.float32) + executing_eagerly = context.executing_eagerly() + ref_y_tensor = model(x) + if not executing_eagerly: + session.run([v.initializer for v in model.variables]) + ref_y = self.evaluate(ref_y_tensor) + model.save_weights(prefix) + for v in model.variables: + self.evaluate( + v.assign(random_ops.random_normal(shape=array_ops.shape(v)))) + + self.addCleanup(shutil.rmtree, temp_dir) + + model.load_weights(prefix) + y = self.evaluate(model(x)) + self.assertAllClose(ref_y, y) + + # Test restore-on-create if this is a subclassed Model (graph Networks + # will have already created their variables). 
+ load_model = make_model_fn() + load_model.load_weights(prefix) + restore_on_create_y_tensor = load_model(x) + restore_on_create_y = self.evaluate(restore_on_create_y_tensor) + self.assertAllClose(ref_y, restore_on_create_y) + + @test_util.run_in_graph_and_eager_modes() + def test_weight_loading_graph_model(self): + def _make_graph_model(): + a = keras.layers.Input(shape=(2,)) + x = keras.layers.Dense(3)(a) + b = keras.layers.Dense(1)(x) + return keras.models.Model(a, b) + + if h5py is None: + self.skipTest('This test only works with h5py.') + + self._weight_loading_test_template(_make_graph_model) + + @test_util.run_in_graph_and_eager_modes() + def test_weight_loading_subclassed_model(self): + self._weight_loading_test_template(SubclassedModel) + + def _new_layer_weight_loading_test_template( + self, first_model_fn, second_model_fn, restore_init_fn, by_name): + with self.test_session() as session: + model = first_model_fn() + temp_dir = self.get_temp_dir() + prefix = os.path.join(temp_dir, 'ckpt') + + x = constant_op.constant(np.random.random((3, 2)), dtype=dtypes.float32) + executing_eagerly = context.executing_eagerly() + ref_y_tensor = model(x) + if not executing_eagerly: + session.run([v.initializer for v in model.variables]) + ref_y = self.evaluate(ref_y_tensor) + model.save_weights(prefix) + for v in model.variables: + self.evaluate( + v.assign(random_ops.random_normal(shape=array_ops.shape(v)))) + + self.addCleanup(shutil.rmtree, temp_dir) + + second_model = second_model_fn() + second_model.load_weights(prefix, by_name=by_name) + second_model(x) + self.evaluate(restore_init_fn(second_model)) + second_model.save_weights(prefix) + # Check that the second model's checkpoint loads into the original model + model.load_weights(prefix, by_name=by_name) + y = self.evaluate(model(x)) + self.assertAllClose(ref_y, y) + + @test_util.run_in_graph_and_eager_modes() + def test_weight_loading_graph_model_added_layer(self): + def _save_graph_model(): + a = keras.layers.Input(shape=(2,)) + x = keras.layers.Dense(3, name='first')(a) + b = keras.layers.Dense(1, name='second')(x) + return keras.models.Model(a, b) + def _restore_graph_model(): + a = keras.layers.Input(shape=(2,)) + x = keras.layers.Dense(3, name='first')(a) + y = keras.layers.Dense(1, name='second')(x) + b = keras.layers.Dense(3, name='secondjr')(y) + return keras.models.Model(a, b) + def _restore_init_fn(restore_model): + return [v.initializer for v in restore_model.layers[-1].variables] + + if h5py is None: + self.skipTest('This test only works with h5py.') + + self._new_layer_weight_loading_test_template( + _save_graph_model, _restore_graph_model, + _restore_init_fn, by_name=True) + + @test_util.run_in_graph_and_eager_modes() + def test_weight_loading_graph_model_added_no_weight_layer(self): + def _save_graph_model(): + a = keras.layers.Input(shape=(2,)) + x = keras.layers.Dense(3, name='first')(a) + b = keras.layers.Dense(1, name='second')(x) + return keras.models.Model(a, b) + def _restore_graph_model(): + a = keras.layers.Input(shape=(2,)) + x = keras.layers.Dense(3, name='first')(a) + y = keras.layers.Dropout(rate=0.1)(x) + b = keras.layers.Dense(1, name='second')(y) + return keras.models.Model(a, b) + def _restore_init_fn(restore_model): + del restore_model # unused + return [] + if h5py is None: + self.skipTest('This test only works with h5py.') + + self._new_layer_weight_loading_test_template( + _save_graph_model, _restore_graph_model, + _restore_init_fn, by_name=False) + + @test_util.run_in_graph_and_eager_modes() + def 
test_weight_loading_subclassed_model_added_layer(self): + + class SubclassedModelRestore(training.Model): + + def __init__(self): + super(SubclassedModelRestore, self).__init__() + self.x_layer = keras.layers.Dense(3) + self.y_layer = keras.layers.Dense(3) + self.b_layer = keras.layers.Dense(1) + + def call(self, a): + return self.b_layer(self.y_layer(self.x_layer(a))) + + def _restore_init_fn(restore_model): + return [v.initializer for v in restore_model.y_layer.variables] + + self._new_layer_weight_loading_test_template( + SubclassedModel, SubclassedModelRestore, + _restore_init_fn, by_name=False) + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py index 146e8fdac9a..5f9b3e8c7d7 100644 --- a/tensorflow/python/keras/_impl/keras/engine/training.py +++ b/tensorflow/python/keras/_impl/keras/engine/training.py @@ -584,6 +584,7 @@ class Model(Network): updates=updates, name='train_function', **self._function_kwargs) + self._post_build_cleanup() def _make_test_function(self): if not hasattr(self, 'test_function'): @@ -601,6 +602,7 @@ class Model(Network): updates=self.state_updates + self.metrics_updates, name='test_function', **self._function_kwargs) + self._post_build_cleanup() def _make_predict_function(self): if not hasattr(self, 'predict_function'): @@ -619,6 +621,7 @@ class Model(Network): updates=self.state_updates, name='predict_function', **kwargs) + self._post_build_cleanup() def _standardize_user_data(self, x, diff --git a/tensorflow/python/keras/_impl/keras/model_subclassing_test.py b/tensorflow/python/keras/_impl/keras/model_subclassing_test.py index bc8698f235a..295ad47f6be 100644 --- a/tensorflow/python/keras/_impl/keras/model_subclassing_test.py +++ b/tensorflow/python/keras/_impl/keras/model_subclassing_test.py @@ -19,7 +19,6 @@ from __future__ import division from __future__ import print_function import os -import tempfile import numpy as np import six @@ -420,8 +419,6 @@ class ModelSubclassingTest(test.TestCase): @test_util.run_in_graph_and_eager_modes() def test_saving(self): - if h5py is None: - return # Skip test if models cannot be saved. 
num_classes = (2, 3) num_samples = 100 @@ -437,20 +434,30 @@ class ModelSubclassingTest(test.TestCase): model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0) y_ref_1, y_ref_2 = model.predict([x1, x2]) - fd, fname = tempfile.mkstemp('.h5') - model.save_weights(fname) + tf_format_name = os.path.join(self.get_temp_dir(), 'ckpt') + model.save_weights(tf_format_name) + if h5py is not None: + hdf5_format_name = os.path.join(self.get_temp_dir(), 'weights.h5') + model.save_weights(hdf5_format_name) model = MultiIOTestModel(num_classes=num_classes, use_bn=True) - # need to build the model before loading weights - # (otherwise no weights to load) - model._set_inputs([x1, x2]) - model.load_weights(fname) + + if h5py is not None: + with self.assertRaises(ValueError): + model.load_weights(hdf5_format_name) + + model.load_weights(tf_format_name) y1, y2 = model.predict([x1, x2]) self.assertAllClose(y_ref_1, y1, atol=1e-5) self.assertAllClose(y_ref_2, y2, atol=1e-5) - os.close(fd) - os.remove(fname) + + if h5py is not None: + model.load_weights(hdf5_format_name) + + y1, y2 = model.predict([x1, x2]) + self.assertAllClose(y_ref_1, y1, atol=1e-5) + self.assertAllClose(y_ref_2, y2, atol=1e-5) @test_util.run_in_graph_and_eager_modes() def test_summary(self): diff --git a/tensorflow/python/training/checkpointable_utils.py b/tensorflow/python/training/checkpointable_utils.py index 4769e15120c..13bd89d9072 100644 --- a/tensorflow/python/training/checkpointable_utils.py +++ b/tensorflow/python/training/checkpointable_utils.py @@ -616,11 +616,10 @@ class CheckpointableSaver(object): # Allow passing in a weak reference to avoid reference cycles when # `Checkpointable` objects save themselves. self._root_checkpointable_ref = root_checkpointable - if not context.executing_eagerly(): - with ops.device("/cpu:0"): - self._file_prefix_placeholder = constant_op.constant("model") - else: - self._file_prefix_placeholder = None + # The file prefix placeholder is created lazily when graph building (and not + # at all when executing eagerly) to avoid creating ops in the constructor + # (when they may never be necessary). 
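+    # (It is materialized on first use, pinned to "/cpu:0"; see the save path
+    #  in the following hunk.)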
+ self._file_prefix_placeholder = None # Op caching for save self._object_graph_feed_tensor = None @@ -778,6 +777,9 @@ class CheckpointableSaver(object): return InitializationOnlyStatus(self._root_checkpointable) in_graph_mode = not context.executing_eagerly() if in_graph_mode: + if self._file_prefix_placeholder is None: + with ops.device("/cpu:0"): + self._file_prefix_placeholder = constant_op.constant("model") file_prefix_tensor = self._file_prefix_placeholder file_prefix_feed_dict = {self._file_prefix_placeholder: save_path} else: diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt index cdf2da712f3..cee76bdc1db 100644 --- a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt @@ -239,7 +239,7 @@ tf_class { } member_method { name: "save_weights" - argspec: "args=[\'self\', \'filepath\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'True\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " } member_method { name: "set_weights" diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt index 5c2c29e60fe..02718cb5f9e 100644 --- a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt @@ -256,7 +256,7 @@ tf_class { } member_method { name: "save_weights" - argspec: "args=[\'self\', \'filepath\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'True\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " } member_method { name: "set_weights" diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt index b3f3f169227..dd78384005f 100644 --- a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt @@ -239,7 +239,7 @@ tf_class { } member_method { name: "save_weights" - argspec: "args=[\'self\', \'filepath\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'True\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " } member_method { name: "set_weights" diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt index 4ac6811bace..9fcb03f47e7 100644 --- a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt @@ -256,7 +256,7 @@ tf_class { } member_method { name: "save_weights" - argspec: "args=[\'self\', \'filepath\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'True\'], " + argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " } member_method { name: "set_weights" diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh index 9627475d84f..8e8b2191e5c 100755 --- a/tensorflow/tools/ci_build/ci_sanity.sh +++ b/tensorflow/tools/ci_build/ci_sanity.sh @@ -101,6 +101,7 @@ do_pylint() { 
"^tensorflow/contrib/eager/python/metrics_impl\.py.*\[E0202.*method-hidden "\ "^tensorflow/python/platform/gfile\.py.*\[E0301.*non-iterator "\ "^tensorflow/python/keras/_impl/keras/callbacks\.py.*\[E1133.*not-an-iterable "\ +"^tensorflow/python/keras/_impl/keras/engine/base_layer.py.*\[E0203.*access-member-before-definition "\ "^tensorflow/python/keras/_impl/keras/layers/recurrent\.py.*\[E0203.*access-member-before-definition "\ "^tensorflow/python/kernel_tests/constant_op_eager_test.py.*\[E0303.*invalid-length-returned" From 06d5ca2ae097c08c886759dd27f90b19e4c6f49d Mon Sep 17 00:00:00 2001 From: Andy Kernahan Date: Mon, 23 Apr 2018 20:32:35 +0100 Subject: [PATCH 0611/1734] Fix tfcompile module label. (#16582) --- tensorflow/docs_src/performance/xla/tfcompile.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/docs_src/performance/xla/tfcompile.md b/tensorflow/docs_src/performance/xla/tfcompile.md index f57ca3948dd..8521d7eacb4 100644 --- a/tensorflow/docs_src/performance/xla/tfcompile.md +++ b/tensorflow/docs_src/performance/xla/tfcompile.md @@ -86,7 +86,7 @@ code. `tf_library` utilizes `tfcompile` to compile the TensorFlow graph into executable code. ```build -load("//third_party/tensorflow/compiler/aot:tfcompile.bzl", "tf_library") +load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") # Use the tf_library macro to compile your graph into executable code. tf_library( @@ -258,8 +258,8 @@ file. ```build # Example of linking your binary -# Also see //third_party/tensorflow/compiler/aot/tests/BUILD -load("//third_party/tensorflow/compiler/aot:tfcompile.bzl", "tf_library") +# Also see //tensorflow/compiler/aot/tests/BUILD +load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") # The same tf_library call from step 2 above. tf_library( From d9191b881fc283d93a8eaa4961c5e16f2205311f Mon Sep 17 00:00:00 2001 From: Martin Wicke Date: Mon, 23 Apr 2018 12:35:35 -0700 Subject: [PATCH 0612/1734] Re-enable metrics_test, increase sharding. PiperOrigin-RevId: 193967074 --- tensorflow/python/kernel_tests/BUILD | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index 8628ca5d401..ebbec39cf3a 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -2877,11 +2877,8 @@ tf_py_test( "//tensorflow/python:random_ops", "//tensorflow/python:variables", ], - shard_count = 10, - tags = [ - "no_windows_gpu", - "noasan", - ], + shard_count = 20, + tags = ["no_windows_gpu"], ) tf_py_test( From 594c1c60f523ba4dd45545876e850ca7281be73a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 23 Apr 2018 13:12:58 -0700 Subject: [PATCH 0613/1734] Entropy bottleneck class. 
PiperOrigin-RevId: 193972549 --- tensorflow/contrib/BUILD | 2 +- tensorflow/contrib/cmake/python_modules.txt | 1 + .../contrib/cmake/tf_core_kernels.cmake | 1 + tensorflow/contrib/coder/BUILD | 56 +- tensorflow/contrib/coder/__init__.py | 3 +- .../coder/python/layers/entropybottleneck.py | 697 ++++++++++++++++++ .../python/layers/entropybottleneck_test.py | 315 ++++++++ 7 files changed, 1071 insertions(+), 4 deletions(-) create mode 100644 tensorflow/contrib/coder/python/layers/entropybottleneck.py create mode 100644 tensorflow/contrib/coder/python/layers/entropybottleneck_test.py diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD index d28392a62c2..8edb8654b83 100644 --- a/tensorflow/contrib/BUILD +++ b/tensorflow/contrib/BUILD @@ -29,7 +29,7 @@ py_library( "//tensorflow/contrib/cloud:cloud_py", "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip", "//tensorflow/contrib/cluster_resolver:cluster_resolver_py", - "//tensorflow/contrib/coder:coder_ops_py", + "//tensorflow/contrib/coder:coder_py", "//tensorflow/contrib/compiler:compiler_py", "//tensorflow/contrib/copy_graph:copy_graph_py", "//tensorflow/contrib/crf:crf_py", diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt index fbcdf7e753d..932a6eeeaad 100644 --- a/tensorflow/contrib/cmake/python_modules.txt +++ b/tensorflow/contrib/cmake/python_modules.txt @@ -144,6 +144,7 @@ tensorflow/contrib/coder tensorflow/contrib/coder/kernels tensorflow/contrib/coder/ops tensorflow/contrib/coder/python +tensorflow/contrib/coder/python/layers tensorflow/contrib/coder/python/ops tensorflow/contrib/compiler tensorflow/contrib/copy_graph diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake index ed018b4fed8..376496b33f4 100644 --- a/tensorflow/contrib/cmake/tf_core_kernels.cmake +++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake @@ -63,6 +63,7 @@ if(tensorflow_BUILD_CONTRIB_KERNELS) "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/training_ops.cc" + "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc" "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder.cc" "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops_util.cc" diff --git a/tensorflow/contrib/coder/BUILD b/tensorflow/contrib/coder/BUILD index 9ca4ce8a9c7..a146460a9cd 100644 --- a/tensorflow/contrib/coder/BUILD +++ b/tensorflow/contrib/coder/BUILD @@ -1,5 +1,5 @@ # Description: -# Contains entropy coding related modules. +# Contains tools related to data compression. 
package(default_visibility = [ "//learning/brain:__subpackages__", @@ -152,10 +152,21 @@ tf_gen_op_wrapper_py( deps = [":coder_ops_op_lib"], ) +py_library( + name = "coder_py", + srcs = [ + "__init__.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":coder_ops_py", + ":entropybottleneck_py", + ], +) + tf_custom_op_py_library( name = "coder_ops_py", srcs = [ - "__init__.py", "python/ops/coder_ops.py", ], dso = [ @@ -186,3 +197,44 @@ tf_py_test( ], main = "python/ops/coder_ops_test.py", ) + +py_library( + name = "entropybottleneck_py", + srcs = [ + "python/layers/entropybottleneck.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":coder_ops_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:constant_op", + "//tensorflow/python:dtypes", + "//tensorflow/python:functional_ops", + "//tensorflow/python:init_ops", + "//tensorflow/python:math_ops", + "//tensorflow/python:nn", + "//tensorflow/python:ops", + "//tensorflow/python:random_ops", + "//tensorflow/python:state_ops", + "//tensorflow/python:summary_ops", + "//tensorflow/python:tensor_shape", + "//tensorflow/python:variable_scope", + "//tensorflow/python/eager:context", + "//tensorflow/python/keras:engine", + "//third_party/py/numpy", + ], +) + +tf_py_test( + name = "entropybottleneck_py_test", + srcs = [ + "python/layers/entropybottleneck_test.py", + ], + additional_deps = [ + ":entropybottleneck_py", + "//tensorflow/python:client_testlib", + "//tensorflow/python:variables", + "//tensorflow/python:training", + ], + main = "python/layers/entropybottleneck_test.py", +) diff --git a/tensorflow/contrib/coder/__init__.py b/tensorflow/contrib/coder/__init__.py index b7e663e6f13..99b8ac7595e 100644 --- a/tensorflow/contrib/coder/__init__.py +++ b/tensorflow/contrib/coder/__init__.py @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Entropy code operations.""" +"""Data compression tools.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function # pylint: disable=wildcard-import +from tensorflow.contrib.coder.python.layers.entropybottleneck import * from tensorflow.contrib.coder.python.ops.coder_ops import * # pylint: enable=wildcard-import diff --git a/tensorflow/contrib/coder/python/layers/entropybottleneck.py b/tensorflow/contrib/coder/python/layers/entropybottleneck.py new file mode 100644 index 00000000000..f039cb0f526 --- /dev/null +++ b/tensorflow/contrib/coder/python/layers/entropybottleneck.py @@ -0,0 +1,697 @@ +# -*- coding: utf-8 -*- +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Entropy bottleneck layer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.coder.python.ops import coder_ops + +from tensorflow.python.eager import context +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.keras._impl.keras import engine +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import functional_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn +from tensorflow.python.ops import random_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.ops import variable_scope +from tensorflow.python.summary import summary + + +class EntropyBottleneck(engine.Layer): + """Entropy bottleneck layer. + + This layer can be used to model the entropy (the amount of information + conveyed) of the tensor passing through it. During training, this can be used + to impose a (soft) entropy constraint on its activations, limiting the amount + of information flowing through the layer. Note that this is distinct from + other types of bottlenecks, which reduce the dimensionality of the space, for + example. Dimensionality reduction does not limit the amount of information, + and does not enable efficient data compression per se. + + After training, this layer can be used to compress any input tensor to a + string, which may be written to a file, and to decompress a file which it + previously generated back to a reconstructed tensor (possibly on a different + machine having access to the same model checkpoint). The entropies estimated + during training or evaluation are approximately equal to the average length of + the strings in bits. + + The layer implements a flexible probability density model to estimate entropy, + which is described in the appendix of the paper (please cite the paper if you + use this code for scientific work): + + "Variational image compression with a scale hyperprior" + + Johannes Ballé, David Minnen, Saurabh Singh, Sung Jin Hwang, Nick Johnston + + https://arxiv.org/abs/1802.01436 + + The layer assumes that the input tensor is at least 2D, with a batch dimension + at the beginning and a channel dimension as specified by `data_format`. The + layer trains an independent probability density model for each channel, but + assumes that across all other dimensions, the inputs are i.i.d. (independent + and identically distributed). Because the entropy (and hence, average + codelength) is a function of the densities, this assumption may have a direct + effect on the compression performance. + + Because data compression always involves discretization, the outputs of the + layer are generally only approximations of its inputs. During training, + discretization is modeled using additive uniform noise to ensure + differentiability. The entropies computed during training are differential + entropies. During evaluation, the data is actually quantized, and the + entropies are discrete (Shannon entropies). 
To make sure the approximated + tensor values are good enough for practical purposes, the training phase must + be used to balance the quality of the approximation with the entropy, by + adding an entropy term to the training loss, as in the following example. + + Here, we use the entropy bottleneck to compress the latent representation of + an autoencoder. The data vectors `x` in this case are 4D tensors in + `'channels_last'` format (for example, 16x16 pixel grayscale images). + + The layer always produces exactly one auxiliary loss and one update op which + are only significant for compression and decompression. To use the compression + feature, the auxiliary loss must be minimized during or after training. After + that, the update op must be executed at least once. Here, we simply attach + them to the main training step. + + Training: + ``` + # Build autoencoder. + x = tf.placeholder(tf.float32, shape=[None, 16, 16, 1]) + y = forward_transform(x) + entropy_bottleneck = EntropyBottleneck() + y_, likelihoods = entropy_bottleneck(y, training=True) + x_ = backward_transform(y_) + + # Information content (= predicted codelength) in bits of each batch element + # (note that taking the natural logarithm and dividing by `log(2)` is + # equivalent to taking base-2 logarithms): + bits = tf.reduce_sum(tf.log(likelihoods), axis=(1, 2, 3)) / -np.log(2) + + # Squared difference of each batch element: + squared_error = tf.reduce_sum(tf.squared_difference(x, x_), axis=(1, 2, 3)) + + # The loss is a weighted sum of mean squared error and entropy (average + # information content), where the weight controls the trade-off between + # approximation error and entropy. + main_loss = 0.5 * tf.reduce_mean(squared_error) + tf.reduce_mean(bits) + + # Minimize loss and auxiliary loss, and execute update op. + main_optimizer = tf.train.AdamOptimizer(learning_rate=1e-4) + main_step = optimizer.minimize(main_loss) + # 1e-2 is a good starting point for the learning rate of the auxiliary loss, + # assuming Adam is used. + aux_optimizer = tf.train.AdamOptimizer(learning_rate=1e-2) + aux_step = optimizer.minimize(entropy_bottleneck.losses[0]) + step = tf.group(main_step, aux_step, entropy_bottleneck.updates[0]) + ``` + + Evaluation: + ``` + # Build autoencoder. + x = tf.placeholder(tf.float32, shape=[None, 16, 16, 1]) + y = forward_transform(x) + y_, likelihoods = EntropyBottleneck()(y, training=False) + x_ = backward_transform(y_) + + # Information content (= predicted codelength) in bits of each batch element: + bits = tf.reduce_sum(tf.log(likelihoods), axis=(1, 2, 3)) / -np.log(2) + + # Squared difference of each batch element: + squared_error = tf.reduce_sum(tf.squared_difference(x, x_), axis=(1, 2, 3)) + + # The loss is a weighted sum of mean squared error and entropy (average + # information content), where the weight controls the trade-off between + # approximation error and entropy. + loss = 0.5 * tf.reduce_mean(squared_error) + tf.reduce_mean(bits) + ``` + + To be able to compress the bottleneck tensor and decompress it in a different + session, or on a different machine, you need three items: + - The compressed representations stored as strings. + - The shape of the bottleneck for these string representations as a `Tensor`, + as well as the number of channels of the bottleneck at graph construction + time. + - The checkpoint of the trained model that was used for compression. 
Note: + It is crucial that the auxiliary loss produced by this layer is minimized + during or after training, and that the update op is run after training and + minimization of the auxiliary loss, but *before* the checkpoint is saved. + + Compression: + ``` + x = tf.placeholder(tf.float32, shape=[None, 16, 16, 1]) + y = forward_transform(x) + strings = EntropyBottleneck().compress(y) + shape = tf.shape(y)[1:] + ``` + + Decompression: + ``` + strings = tf.placeholder(tf.string, shape=[None]) + shape = tf.placeholder(tf.int32, shape=[3]) + entropy_bottleneck = EntropyBottleneck(dtype=tf.float32) + y_ = entropy_bottleneck.decompress(strings, shape, channels=5) + x_ = backward_transform(y_) + ``` + Here, we assumed that the tensor produced by the forward transform has 5 + channels. + + The above four use cases can also be implemented within the same session (i.e. + on the same `EntropyBottleneck` instance), for testing purposes, etc., by + calling the object more than once. + + Arguments: + init_scale: Float. A scaling factor determining the initial width of the + probability densities. This should be chosen big enough so that the + range of values of the layer inputs roughly falls within the interval + [`-init_scale`, `init_scale`] at the beginning of training. + filters: An iterable of ints, giving the number of filters at each layer of + the density model. Generally, the more filters and layers, the more + expressive is the density model in terms of modeling more complicated + distributions of the layer inputs. For details, refer to the paper + referenced above. The default is `[3, 3, 3]`, which should be sufficient + for most practical purposes. + tail_mass: Float, between 0 and 1. The bottleneck layer automatically + determines the range of input values that should be represented based on + their frequency of occurrence. Values occurring in the tails of the + distributions will be clipped to that range during compression. + `tail_mass` determines the amount of probability mass in the tails which + is cut off in the worst case. For example, the default value of `1e-9` + means that at most 1 in a billion input samples will be clipped to the + range. + optimize_integer_offset: Boolean. Typically, the input values of this layer + are floats, which means that quantization during evaluation can be + performed with an arbitrary offset. By default, the layer determines that + offset automatically. In special situations, such as when it is known that + the layer will receive only full integer values during evaluation, it can + be desirable to set this argument to `False` instead, in order to always + quantize to full integer values. + likelihood_bound: Float. If positive, the returned likelihood values are + ensured to be greater than or equal to this value. This prevents very + large gradients with a typical entropy loss (defaults to 1e-9). + range_coder_precision: Integer, between 1 and 16. The precision of the range + coder used for compression and decompression. This trades off computation + speed with compression efficiency, where 16 is the slowest but most + efficient setting. Choosing lower values may increase the average + codelength slightly compared to the estimated entropies. + data_format: Either `'channels_first'` or `'channels_last'` (default). + trainable: Boolean. Whether the layer should be trained. + name: String. The name of the layer. + dtype: Default dtype of the layer's parameters (default of `None` means use + the type of the first input). 
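+
+  For illustration, constructing a layer with non-default arguments might look
+  as follows (the argument values here are examples only, not tuned
+  recommendations):
+  ```
+  entropy_bottleneck = EntropyBottleneck(
+      init_scale=30, filters=(3, 3, 3, 3), data_format="channels_first")
+  ```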
+ + Read-only properties: + init_scale: See above. + filters: See above. + tail_mass: See above. + optimize_integer_offset: See above. + likelihood_bound: See above. + range_coder_precision: See above. + data_format: See above. + name: String. See above. + dtype: See above. + trainable_variables: List of trainable variables. + non_trainable_variables: List of non-trainable variables. + variables: List of all variables of this layer, trainable and non-trainable. + updates: List of update ops of this layer. Always contains exactly one + update op, which must be run once after the last training step, before + `compress` or `decompress` is used. + losses: List of losses added by this layer. Always contains exactly one + auxiliary loss, which must be added to the training loss. + + Mutable properties: + trainable: Boolean. Whether the layer should be trained. + input_spec: Optional `InputSpec` object specifying the constraints on inputs + that can be accepted by the layer. + """ + + def __init__(self, init_scale=10, filters=(3, 3, 3), tail_mass=1e-9, + optimize_integer_offset=True, likelihood_bound=1e-9, + range_coder_precision=16, data_format="channels_last", **kwargs): + super(EntropyBottleneck, self).__init__(**kwargs) + self._init_scale = float(init_scale) + self._filters = tuple(int(f) for f in filters) + self._tail_mass = float(tail_mass) + if not 0 < self.tail_mass < 1: + raise ValueError( + "`tail_mass` must be between 0 and 1, got {}.".format(self.tail_mass)) + self._optimize_integer_offset = bool(optimize_integer_offset) + self._likelihood_bound = float(likelihood_bound) + self._range_coder_precision = int(range_coder_precision) + self._data_format = data_format + self._channel_axis(2) # trigger ValueError early + self.input_spec = engine.InputSpec(min_ndim=2) + + @property + def init_scale(self): + return self._init_scale + + @property + def filters(self): + return self._filters + + @property + def tail_mass(self): + return self._tail_mass + + @property + def optimize_integer_offset(self): + return self._optimize_integer_offset + + @property + def likelihood_bound(self): + return self._likelihood_bound + + @property + def range_coder_precision(self): + return self._range_coder_precision + + @property + def data_format(self): + return self._data_format + + def _channel_axis(self, ndim): + try: + return {"channels_first": 1, "channels_last": ndim - 1}[self.data_format] + except KeyError: + raise ValueError("Unsupported `data_format` for {} layer: {}.".format( + self.__class__.__name__, self.data_format)) + + def _logits_cumulative(self, inputs, stop_gradient): + """Evaluate logits of the cumulative densities. + + Args: + inputs: The values at which to evaluate the cumulative densities, expected + to be a `Tensor` of shape `(channels, 1, batch)`. + stop_gradient: Boolean. Whether to add `array_ops.stop_gradient` calls so + that the gradient of the output with respect to the density model + parameters is disconnected (the gradient with respect to `inputs` is + left untouched). + + Returns: + A `Tensor` of the same shape as `inputs`, containing the logits of the + cumulative densities evaluated at the given inputs. 
+ """ + logits = inputs + + for i in range(len(self.filters) + 1): + matrix = self._matrices[i] + if stop_gradient: + matrix = array_ops.stop_gradient(matrix) + logits = math_ops.matmul(matrix, logits) + + bias = self._biases[i] + if stop_gradient: + bias = array_ops.stop_gradient(bias) + logits += bias + + if i < len(self._factors): + factor = self._factors[i] + if stop_gradient: + factor = array_ops.stop_gradient(factor) + logits += factor * math_ops.tanh(logits) + + return logits + + def build(self, input_shape): + """Builds the layer. + + Creates the variables for the network modeling the densities, creates the + auxiliary loss estimating the median and tail quantiles of the densities, + and then uses that to create the probability mass functions and the update + op that produces the discrete cumulative density functions used by the range + coder. + + Args: + input_shape: Shape of the input tensor, used to get the number of + channels. + + Raises: + ValueError: if `input_shape` doesn't specify the length of the channel + dimension. + """ + input_shape = tensor_shape.TensorShape(input_shape) + channel_axis = self._channel_axis(input_shape.ndims) + channels = input_shape[channel_axis].value + if channels is None: + raise ValueError("The channel dimension of the inputs must be defined.") + self.input_spec = engine.InputSpec( + ndim=input_shape.ndims, axes={channel_axis: channels}) + filters = (1,) + self.filters + (1,) + scale = self.init_scale ** (1 / (len(self.filters) + 1)) + + # Create variables. + self._matrices = [] + self._biases = [] + self._factors = [] + for i in range(len(self.filters) + 1): + init = np.log(np.expm1(1 / scale / filters[i + 1])) + matrix = self.add_variable( + "matrix_{}".format(i), dtype=self.dtype, + shape=(channels, filters[i + 1], filters[i]), + initializer=init_ops.Constant(init)) + matrix = nn.softplus(matrix) + self._matrices.append(matrix) + + bias = self.add_variable( + "bias_{}".format(i), dtype=self.dtype, + shape=(channels, filters[i + 1], 1), + initializer=init_ops.RandomUniform(-.5, .5)) + self._biases.append(bias) + + if i < len(self.filters): + factor = self.add_variable( + "factor_{}".format(i), dtype=self.dtype, + shape=(channels, filters[i + 1], 1), + initializer=init_ops.Zeros()) + factor = math_ops.tanh(factor) + self._factors.append(factor) + + # To figure out what range of the densities to sample, we need to compute + # the quantiles given by `tail_mass / 2` and `1 - tail_mass / 2`. Since we + # can't take inverses of the cumulative directly, we make it an optimization + # problem: + # `quantiles = argmin(|logit(cumulative) - target|)` + # where `target` is `logit(tail_mass / 2)` or `logit(1 - tail_mass / 2)`. + # Taking the logit (inverse of sigmoid) of the cumulative makes the + # representation of the right target more numerically stable. + + # Numerically stable way of computing logits of `tail_mass / 2` + # and `1 - tail_mass / 2`. + target = np.log(2 / self.tail_mass - 1) + # Compute lower and upper tail quantile as well as median. 
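+    # For the default tail_mass = 1e-9, target = log(2 / 1e-9 - 1) ~= 21.4;
+    # since sigmoid(-21.4) ~= 5e-10 = tail_mass / 2, the entries [-target, 0,
+    # target] below correspond to the lower tail, the median (logit 0), and
+    # the upper tail of each density.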
+ target = constant_op.constant([-target, 0, target], dtype=self.dtype) + + def quantiles_initializer(shape, dtype=None, partition_info=None): + del partition_info # unused + assert tuple(shape[1:]) == (1, 3) + init = constant_op.constant( + [[[-self.init_scale, 0, self.init_scale]]], dtype=dtype) + return array_ops.tile(init, (shape[0], 1, 1)) + + quantiles = self.add_variable( + "quantiles", shape=(channels, 1, 3), dtype=self.dtype, + initializer=quantiles_initializer) + logits = self._logits_cumulative(quantiles, stop_gradient=True) + loss = math_ops.reduce_sum(abs(logits - target)) + self.add_loss(loss, inputs=None) + + # Save medians for `call`, `compress`, and `decompress`. + self._medians = quantiles[:, :, 1:2] + if not self.optimize_integer_offset: + self._medians = math_ops.round(self._medians) + + # Largest distance observed between lower tail quantile and median, + # or between median and upper tail quantile. + minima = math_ops.reduce_max(self._medians - quantiles[:, :, 0:1]) + maxima = math_ops.reduce_max(quantiles[:, :, 2:3] - self._medians) + minmax = math_ops.maximum(minima, maxima) + minmax = math_ops.ceil(minmax) + minmax = math_ops.maximum(minmax, 1) + + # Sample the density up to `minmax` around the median. + samples = math_ops.range(-minmax, minmax + 1, dtype=self.dtype) + samples += self._medians + + half = constant_op.constant(.5, dtype=self.dtype) + # We strip the sigmoid from the end here, so we can use the special rule + # below to only compute differences in the left tail of the sigmoid. + # This increases numerical stability (see explanation in `call`). + lower = self._logits_cumulative(samples - half, stop_gradient=True) + upper = self._logits_cumulative(samples + half, stop_gradient=True) + # Flip signs if we can move more towards the left tail of the sigmoid. + sign = -math_ops.sign(math_ops.add_n([lower, upper])) + pmf = abs(math_ops.sigmoid(sign * upper) - math_ops.sigmoid(sign * lower)) + # Add tail masses to first and last bin of pmf, as we clip values for + # compression, meaning that out-of-range values get mapped to these bins. + pmf = array_ops.concat([ + math_ops.add_n([pmf[:, 0, :1], math_ops.sigmoid(lower[:, 0, :1])]), + pmf[:, 0, 1:-1], + math_ops.add_n([pmf[:, 0, -1:], math_ops.sigmoid(-upper[:, 0, -1:])]), + ], axis=-1) + self._pmf = pmf + + cdf = coder_ops.pmf_to_quantized_cdf( + pmf, precision=self.range_coder_precision) + def cdf_getter(*args, **kwargs): + del args, kwargs # ignored + return variable_scope.get_variable( + "quantized_cdf", dtype=dtypes.int32, initializer=cdf, + trainable=False, validate_shape=False, collections=()) + # Need to provide a fake shape here since add_variable insists on it. + self._quantized_cdf = self.add_variable( + "quantized_cdf", shape=(channels, 1), dtype=dtypes.int32, + getter=cdf_getter, trainable=False) + + update_op = state_ops.assign( + self._quantized_cdf, cdf, validate_shape=False) + self.add_update(update_op, inputs=None) + + super(EntropyBottleneck, self).build(input_shape) + + def call(self, inputs, training): + """Pass a tensor through the bottleneck. + + Args: + inputs: The tensor to be passed through the bottleneck. + training: Boolean. If `True`, returns a differentiable approximation of + the inputs, and their likelihoods under the modeled probability + densities. If `False`, returns the quantized inputs and their + likelihoods under the corresponding probability mass function. 
These + quantities can't be used for training, as they are not differentiable, + but represent actual compression more closely. + + Returns: + values: `Tensor` with the same shape as `inputs` containing the perturbed + or quantized input values. + likelihood: `Tensor` with the same shape as `inputs` containing the + likelihood of `values` under the modeled probability distributions. + + Raises: + ValueError: if `inputs` has different `dtype` or number of channels than + a previous set of inputs the model was invoked with earlier. + """ + inputs = ops.convert_to_tensor(inputs) + ndim = self.input_spec.ndim + channel_axis = self._channel_axis(ndim) + half = constant_op.constant(.5, dtype=self.dtype) + + # Convert to (channels, 1, batch) format by commuting channels to front + # and then collapsing. + order = list(range(ndim)) + order.pop(channel_axis) + order.insert(0, channel_axis) + values = array_ops.transpose(inputs, order) + shape = array_ops.shape(values) + values = array_ops.reshape(values, (shape[0], 1, -1)) + + # Add noise or quantize. + if training: + noise = random_ops.random_uniform(array_ops.shape(values), -half, half) + values = math_ops.add_n([values, noise]) + elif self.optimize_integer_offset: + values = math_ops.round(values - self._medians) + self._medians + else: + values = math_ops.round(values) + + # Evaluate densities. + # We can use the special rule below to only compute differences in the left + # tail of the sigmoid. This increases numerical stability: sigmoid(x) is 1 + # for large x, 0 for small x. Subtracting two numbers close to 0 can be done + # with much higher precision than subtracting two numbers close to 1. + lower = self._logits_cumulative(values - half, stop_gradient=False) + upper = self._logits_cumulative(values + half, stop_gradient=False) + # Flip signs if we can move more towards the left tail of the sigmoid. + sign = -math_ops.sign(math_ops.add_n([lower, upper])) + sign = array_ops.stop_gradient(sign) + likelihood = abs( + math_ops.sigmoid(sign * upper) - math_ops.sigmoid(sign * lower)) + if self.likelihood_bound > 0: + likelihood_bound = constant_op.constant( + self.likelihood_bound, dtype=self.dtype) + # TODO(jballe): Override gradients. + likelihood = math_ops.maximum(likelihood, likelihood_bound) + + # Convert back to input tensor shape. + order = list(range(1, ndim)) + order.insert(channel_axis, 0) + values = array_ops.reshape(values, shape) + values = array_ops.transpose(values, order) + likelihood = array_ops.reshape(likelihood, shape) + likelihood = array_ops.transpose(likelihood, order) + + if not context.executing_eagerly(): + values_shape, likelihood_shape = self.compute_output_shape(inputs.shape) + values.set_shape(values_shape) + likelihood.set_shape(likelihood_shape) + + return values, likelihood + + def compress(self, inputs): + """Compress inputs and store their binary representations into strings. + + Args: + inputs: `Tensor` with values to be compressed. + + Returns: + String `Tensor` vector containing the compressed representation of each + batch element of `inputs`. + """ + with ops.name_scope(self._name_scope()): + inputs = ops.convert_to_tensor(inputs) + if not self.built: + # Check input assumptions set before layer building, e.g. input rank. + self._assert_input_compatibility(inputs) + if self.dtype is None: + self._dtype = inputs.dtype.base_dtype.name + self.build(inputs.shape) + + # Check input assumptions set after layer building, e.g. input shape. 
+      if not context.executing_eagerly():
+        self._assert_input_compatibility(inputs)
+
+      ndim = self.input_spec.ndim
+      channel_axis = self._channel_axis(ndim)
+      # Tuple of slices for expanding dimensions of tensors below.
+      slices = ndim * [None] + [slice(None)]
+      slices[channel_axis] = slice(None)
+      slices = tuple(slices)
+
+      # Expand dimensions of CDF to input dimensions, keeping the channels along
+      # the right dimension.
+      cdf = self._quantized_cdf[slices[1:]]
+      num_levels = array_ops.shape(cdf)[-1] - 1
+
+      # Bring inputs to the right range by centering the range on the medians.
+      half = constant_op.constant(.5, dtype=self.dtype)
+      medians = array_ops.squeeze(self._medians, [1, 2])
+      offsets = (math_ops.cast(num_levels // 2, self.dtype) + half) - medians
+      # Expand offsets to input dimensions and add to inputs.
+      values = inputs + offsets[slices[:-1]]
+
+      # Clip to range and cast to integers. Because we have added .5 above, and
+      # all values are positive, the cast effectively implements rounding.
+      values = math_ops.maximum(values, half)
+      values = math_ops.minimum(
+          values, math_ops.cast(num_levels, self.dtype) - half)
+      values = math_ops.cast(values, dtypes.int16)
+
+      def loop_body(tensor):
+        return coder_ops.range_encode(
+            tensor, cdf, precision=self.range_coder_precision)
+      strings = functional_ops.map_fn(
+          loop_body, values, dtype=dtypes.string, back_prop=False)
+
+      if not context.executing_eagerly():
+        strings.set_shape(inputs.shape[:1])
+
+      return strings
+
+  def decompress(self, strings, shape, channels=None):
+    """Decompress values from their compressed string representations.
+
+    Args:
+      strings: A string `Tensor` vector containing the compressed data.
+      shape: A `Tensor` vector of int32 type. Contains the shape of the tensor
+        to be decompressed, excluding the batch dimension.
+      channels: Integer. Specifies the number of channels statically. Need only
+        be set if the layer hasn't been built yet (i.e., this is the first input
+        it receives).
+
+    Returns:
+      The decompressed `Tensor`. Its shape will be equal to `shape` prepended
+      with the batch dimension from `strings`.
+
+    Raises:
+      ValueError: If the length of `shape` isn't available at graph construction
+        time.
+    """
+    with ops.name_scope(self._name_scope()):
+      strings = ops.convert_to_tensor(strings)
+      shape = ops.convert_to_tensor(shape)
+      if self.built:
+        ndim = self.input_spec.ndim
+        channel_axis = self._channel_axis(ndim)
+        if channels is None:
+          channels = self.input_spec.axes[channel_axis]
+      else:
+        if not (shape.shape.is_fully_defined() and shape.shape.ndims == 1):
+          raise ValueError("`shape` must be a vector with known length.")
+        ndim = shape.shape[0].value + 1
+        channel_axis = self._channel_axis(ndim)
+        input_shape = ndim * [None]
+        input_shape[channel_axis] = channels
+        self.build(input_shape)
+
+      # Tuple of slices for expanding dimensions of tensors below.
+      slices = ndim * [None] + [slice(None)]
+      slices[channel_axis] = slice(None)
+      slices = tuple(slices)
+
+      # Expand dimensions of CDF to input dimensions, keeping the channels along
+      # the right dimension.
+      cdf = self._quantized_cdf[slices[1:]]
+      num_levels = array_ops.shape(cdf)[-1] - 1
+
+      def loop_body(string):
+        return coder_ops.range_decode(
+            string, shape, cdf, precision=self.range_coder_precision)
+      outputs = functional_ops.map_fn(
+          loop_body, strings, dtype=dtypes.int16, back_prop=False)
+      outputs = math_ops.cast(outputs, self.dtype)
+
+      medians = array_ops.squeeze(self._medians, [1, 2])
+      offsets = math_ops.cast(num_levels // 2, self.dtype) - medians
+      outputs -= offsets[slices[:-1]]
+
+      if not context.executing_eagerly():
+        outputs_shape = ndim * [None]
+        outputs_shape[0] = strings.shape[0]
+        outputs_shape[channel_axis] = channels
+        outputs.set_shape(outputs_shape)
+
+      return outputs
+
+  def visualize(self):
+    """Multi-channel visualization of densities as images.
+
+    Creates and returns an image summary visualizing the current probability
+    density estimates. The image contains one row for each channel. Within each
+    row, the pixel intensities are proportional to probability values, and each
+    row is centered on the median of the corresponding distribution.
+
+    Returns:
+      The created image summary.
+    """
+    with ops.name_scope(self._name_scope()):
+      image = self._pmf
+      image *= 255 / math_ops.reduce_max(image, axis=1, keepdims=True)
+      image = math_ops.cast(image + .5, dtypes.uint8)
+      image = image[None, :, :, None]
+      return summary.image("pmf", image, max_outputs=1)
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    return input_shape, input_shape
diff --git a/tensorflow/contrib/coder/python/layers/entropybottleneck_test.py b/tensorflow/contrib/coder/python/layers/entropybottleneck_test.py
new file mode 100644
index 00000000000..798b0234ebc
--- /dev/null
+++ b/tensorflow/contrib/coder/python/layers/entropybottleneck_test.py
@@ -0,0 +1,315 @@
+# -*- coding: utf-8 -*-
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests of EntropyBottleneck class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.coder.python.layers import entropybottleneck
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
+
+
+class EntropyBottleneckTest(test.TestCase):
+
+  def test_noise(self):
+    # Tests that the noise added is uniform noise between -0.5 and 0.5.
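+    # If the noise is truly uniform on (-.5, .5), some of the perturbations
+    # should exceed .49 in magnitude (so the first assertion below expects
+    # `allclose` with atol=.49 to fail), while none should exceed .5
+    # (second assertion).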
+ inputs = array_ops.placeholder(dtypes.float32, (None, 1)) + layer = entropybottleneck.EntropyBottleneck() + noisy, _ = layer(inputs, training=True) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + values = np.linspace(-50, 50, 100)[:, None] + noisy, = sess.run([noisy], {inputs: values}) + self.assertFalse(np.allclose(values, noisy, rtol=0, atol=.49)) + self.assertAllClose(values, noisy, rtol=0, atol=.5) + + def test_quantization(self): + # Tests that inputs are quantized to full integer values, even after + # quantiles have been updated. + inputs = array_ops.placeholder(dtypes.float32, (None, 1)) + layer = entropybottleneck.EntropyBottleneck(optimize_integer_offset=False) + quantized, _ = layer(inputs, training=False) + opt = gradient_descent.GradientDescentOptimizer(learning_rate=1) + self.assertTrue(len(layer.losses) == 1) + step = opt.minimize(layer.losses[0]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + sess.run(step) + values = np.linspace(-50, 50, 100)[:, None] + quantized, = sess.run([quantized], {inputs: values}) + self.assertAllClose(np.around(values), quantized, rtol=0, atol=1e-6) + + def test_quantization_optimized_offset(self): + # Tests that inputs are not quantized to full integer values after quantiles + # have been updated. However, the difference between input and output should + # be between -0.5 and 0.5, and the offset must be consistent. + inputs = array_ops.placeholder(dtypes.float32, (None, 1)) + layer = entropybottleneck.EntropyBottleneck(optimize_integer_offset=True) + quantized, _ = layer(inputs, training=False) + opt = gradient_descent.GradientDescentOptimizer(learning_rate=1) + self.assertTrue(len(layer.losses) == 1) + step = opt.minimize(layer.losses[0]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + sess.run(step) + values = np.linspace(-50, 50, 100)[:, None] + quantized, = sess.run([quantized], {inputs: values}) + self.assertAllClose(values, quantized, rtol=0, atol=.5) + diff = np.ravel(np.around(values) - quantized) % 1 + self.assertAllClose(diff, np.full_like(diff, diff[0]), rtol=0, atol=5e-6) + self.assertNotEqual(diff[0], 0) + + def test_codec(self): + # Tests that inputs are compressed and decompressed correctly, and quantized + # to full integer values, even after quantiles have been updated. + inputs = array_ops.placeholder(dtypes.float32, (1, None, 1)) + layer = entropybottleneck.EntropyBottleneck( + data_format="channels_last", init_scale=60, + optimize_integer_offset=False) + bitstrings = layer.compress(inputs) + decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) + opt = gradient_descent.GradientDescentOptimizer(learning_rate=1) + self.assertTrue(len(layer.losses) == 1) + step = opt.minimize(layer.losses[0]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + sess.run(step) + self.assertTrue(len(layer.updates) == 1) + sess.run(layer.updates[0]) + values = np.linspace(-50, 50, 100)[None, :, None] + decoded, = sess.run([decoded], {inputs: values}) + self.assertAllClose(np.around(values), decoded, rtol=0, atol=1e-6) + + def test_codec_optimized_offset(self): + # Tests that inputs are compressed and decompressed correctly, and not + # quantized to full integer values after quantiles have been updated. + # However, the difference between input and output should be between -0.5 + # and 0.5, and the offset must be consistent. 
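+    # Offset consistency is verified below by reducing (round(x) - decoded)
+    # modulo 1: every element must share the same nonzero fractional offset.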
+ inputs = array_ops.placeholder(dtypes.float32, (1, None, 1)) + layer = entropybottleneck.EntropyBottleneck( + data_format="channels_last", init_scale=60, + optimize_integer_offset=True) + bitstrings = layer.compress(inputs) + decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) + opt = gradient_descent.GradientDescentOptimizer(learning_rate=1) + self.assertTrue(len(layer.losses) == 1) + step = opt.minimize(layer.losses[0]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + sess.run(step) + self.assertTrue(len(layer.updates) == 1) + sess.run(layer.updates[0]) + values = np.linspace(-50, 50, 100)[None, :, None] + decoded, = sess.run([decoded], {inputs: values}) + self.assertAllClose(values, decoded, rtol=0, atol=.5) + diff = np.ravel(np.around(values) - decoded) % 1 + self.assertAllClose(diff, np.full_like(diff, diff[0]), rtol=0, atol=5e-6) + self.assertNotEqual(diff[0], 0) + + def test_codec_clipping(self): + # Tests that inputs are compressed and decompressed correctly, and clipped + # to the expected range. + inputs = array_ops.placeholder(dtypes.float32, (1, None, 1)) + layer = entropybottleneck.EntropyBottleneck( + data_format="channels_last", init_scale=40) + bitstrings = layer.compress(inputs) + decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + self.assertTrue(len(layer.updates) == 1) + sess.run(layer.updates[0]) + values = np.linspace(-50, 50, 100)[None, :, None] + decoded, = sess.run([decoded], {inputs: values}) + expected = np.clip(np.around(values), -40, 40) + self.assertAllClose(expected, decoded, rtol=0, atol=1e-6) + + def test_channels_last(self): + # Test the layer with more than one channel and multiple input dimensions, + # with the channels in the last dimension. + inputs = array_ops.placeholder(dtypes.float32, (None, None, None, 2)) + layer = entropybottleneck.EntropyBottleneck( + data_format="channels_last", init_scale=50) + noisy, _ = layer(inputs, training=True) + quantized, _ = layer(inputs, training=False) + bitstrings = layer.compress(inputs) + decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + self.assertTrue(len(layer.updates) == 1) + sess.run(layer.updates[0]) + values = 5 * np.random.normal(size=(7, 5, 3, 2)) + noisy, quantized, decoded = sess.run( + [noisy, quantized, decoded], {inputs: values}) + self.assertAllClose(values, noisy, rtol=0, atol=.5) + self.assertAllClose(values, quantized, rtol=0, atol=.5) + self.assertAllClose(values, decoded, rtol=0, atol=.5) + + def test_channels_first(self): + # Test the layer with more than one channel and multiple input dimensions, + # with the channel dimension right after the batch dimension. 
+ inputs = array_ops.placeholder(dtypes.float32, (None, 3, None, None)) + layer = entropybottleneck.EntropyBottleneck( + data_format="channels_first", init_scale=50) + noisy, _ = layer(inputs, training=True) + quantized, _ = layer(inputs, training=False) + bitstrings = layer.compress(inputs) + decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + self.assertTrue(len(layer.updates) == 1) + sess.run(layer.updates[0]) + values = 5 * np.random.normal(size=(2, 3, 5, 7)) + noisy, quantized, decoded = sess.run( + [noisy, quantized, decoded], {inputs: values}) + self.assertAllClose(values, noisy, rtol=0, atol=.5) + self.assertAllClose(values, quantized, rtol=0, atol=.5) + self.assertAllClose(values, decoded, rtol=0, atol=.5) + + def test_compress(self): + # Test compression and decompression, and produce test data for + # `test_decompress`. If you set the constant at the end to `True`, this test + # will fail and the log will contain the new test data. + inputs = array_ops.placeholder(dtypes.float32, (2, 3, 10)) + layer = entropybottleneck.EntropyBottleneck( + data_format="channels_first", filters=(), init_scale=2) + bitstrings = layer.compress(inputs) + decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + self.assertTrue(len(layer.updates) == 1) + sess.run(layer.updates[0]) + values = 5 * np.random.uniform(size=(2, 3, 10)) - 2.5 + bitstrings, quantized_cdf, decoded = sess.run( + [bitstrings, layer._quantized_cdf, decoded], {inputs: values}) + self.assertAllClose(values, decoded, rtol=0, atol=.5) + # Set this constant to `True` to log new test data for `test_decompress`. + if False: # pylint:disable=using-constant-test + assert False, (bitstrings, quantized_cdf, decoded) + + # Data generated by `test_compress`. + # pylint:disable=g-inconsistent-quotes,bad-whitespace + bitstrings = np.array([ + b'\x1e\xbag}\xc2\xdaN\x8b\xbd.', + b'\x8dF\xf0%\x1cv\xccllW' + ], dtype=object) + + quantized_cdf = np.array([ + [ 0, 15636, 22324, 30145, 38278, 65536], + [ 0, 19482, 26927, 35052, 42904, 65535], + [ 0, 21093, 28769, 36919, 44578, 65536] + ], dtype=np.int32) + + expected = np.array([ + [[-2., 1., 0., -2., -1., -2., -2., -2., 2., -1.], + [ 1., 2., 1., 0., -2., -2., 1., 2., 0., 1.], + [ 2., 0., -2., 2., 0., -1., -2., 0., 2., 0.]], + [[ 1., 2., 0., -1., 1., 2., 1., 1., 2., -2.], + [ 2., -1., -1., 0., -1., 2., 0., 2., -2., 2.], + [ 2., -2., -2., -1., -2., 1., -2., 0., 0., 0.]] + ], dtype=np.float32) + # pylint:enable=g-inconsistent-quotes,bad-whitespace + + def test_decompress(self): + # Test that decompression of values compressed with a previous version + # works, i.e. that the file format doesn't change across revisions. 
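+    # The recorded `quantized_cdf` is fed through a placeholder below, so
+    # decompression runs against the stored tables rather than freshly
+    # trained ones.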
+ bitstrings = array_ops.placeholder(dtypes.string) + input_shape = array_ops.placeholder(dtypes.int32) + quantized_cdf = array_ops.placeholder(dtypes.int32) + layer = entropybottleneck.EntropyBottleneck( + data_format="channels_first", filters=(), dtype=dtypes.float32) + layer.build(self.expected.shape) + layer._quantized_cdf = quantized_cdf + decoded = layer.decompress(bitstrings, input_shape[1:]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + decoded, = sess.run([decoded], { + bitstrings: self.bitstrings, input_shape: self.expected.shape, + quantized_cdf: self.quantized_cdf}) + self.assertAllClose(self.expected, decoded, rtol=0, atol=1e-6) + + def test_build_decompress(self): + # Test that layer can be built when `decompress` is the first call to it. + bitstrings = array_ops.placeholder(dtypes.string) + input_shape = array_ops.placeholder(dtypes.int32, shape=[3]) + layer = entropybottleneck.EntropyBottleneck(dtype=dtypes.float32) + layer.decompress(bitstrings, input_shape[1:], channels=5) + self.assertTrue(layer.built) + + def test_pmf_normalization(self): + # Test that probability mass functions are normalized correctly. + layer = entropybottleneck.EntropyBottleneck(dtype=dtypes.float32) + layer.build((None, 10)) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + pmf, = sess.run([layer._pmf]) + self.assertAllClose(np.ones(10), np.sum(pmf, axis=-1), rtol=0, atol=1e-6) + + def test_visualize(self): + # Test that summary op can be constructed. + layer = entropybottleneck.EntropyBottleneck(dtype=dtypes.float32) + layer.build((None, 10)) + summary = layer.visualize() + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + sess.run([summary]) + + def test_normalization(self): + # Test that densities are normalized correctly. + inputs = array_ops.placeholder(dtypes.float32, (None, 1)) + layer = entropybottleneck.EntropyBottleneck(filters=(2,)) + _, likelihood = layer(inputs, training=True) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + x = np.repeat(np.arange(-200, 201), 1000)[:, None] + likelihood, = sess.run([likelihood], {inputs: x}) + self.assertEqual(x.shape, likelihood.shape) + integral = np.sum(likelihood) * .001 + self.assertAllClose(1, integral, rtol=0, atol=1e-4) + + def test_entropy_estimates(self): + # Test that entropy estimates match actual range coding. 
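+    # Differential (training) and discrete (eval) entropy estimates are
+    # compared against the actual codelength, 8 bits per byte of bitstring;
+    # the real code is expected to be slightly longer than the estimate.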
+ inputs = array_ops.placeholder(dtypes.float32, (1, None, 1)) + layer = entropybottleneck.EntropyBottleneck( + filters=(2, 3), data_format="channels_last") + _, likelihood = layer(inputs, training=True) + diff_entropy = math_ops.reduce_sum(math_ops.log(likelihood)) / -np.log(2) + _, likelihood = layer(inputs, training=False) + disc_entropy = math_ops.reduce_sum(math_ops.log(likelihood)) / -np.log(2) + bitstrings = layer.compress(inputs) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + self.assertTrue(len(layer.updates) == 1) + sess.run(layer.updates[0]) + diff_entropy, disc_entropy, bitstrings = sess.run( + [diff_entropy, disc_entropy, bitstrings], + {inputs: np.random.normal(size=(1, 10000, 1))}) + codelength = 8 * sum(len(bitstring) for bitstring in bitstrings) + self.assertAllClose(diff_entropy, disc_entropy, rtol=5e-3, atol=0) + self.assertAllClose(disc_entropy, codelength, rtol=5e-3, atol=0) + self.assertGreater(codelength, disc_entropy) + + +if __name__ == "__main__": + test.main() From 8e544335e15029ccccbe743ee0fefaa344b62e4e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 23 Apr 2018 13:28:01 -0700 Subject: [PATCH 0614/1734] Remove unused function from FunctionDefLibrary. PiperOrigin-RevId: 193974712 --- .../grappler/optimizers/function_optimizer.cc | 126 +++++++++++++++--- .../grappler/optimizers/function_optimizer.h | 6 +- .../optimizers/function_optimizer_test.cc | 32 ++--- .../grappler/optimizers/meta_optimizer.cc | 6 +- tensorflow/core/grappler/utils/functions.cc | 12 +- tensorflow/core/grappler/utils/functions.h | 40 ++++-- .../core/grappler/utils/functions_test.cc | 8 +- 7 files changed, 163 insertions(+), 67 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc index d008a9719fe..47e7dc0a969 100644 --- a/tensorflow/core/grappler/optimizers/function_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/graph_def_util.h" #include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op_def.pb.h" #include "tensorflow/core/framework/versions.pb.h" #include "tensorflow/core/graph/graph_constructor.h" @@ -75,12 +76,10 @@ string UniqueSpecializedFunctionName(const FunctionDef& func, class FunctionOptimizerContext { public: - explicit FunctionOptimizerContext(const GrapplerItem& item, - RewriterConfig::Toggle opt_level) - : opt_level_(opt_level), - function_library_(FunctionLibraryDefinition(OpRegistry::Global(), - item.graph.library())) { - InitializeInlinedFunctions(item); + explicit FunctionOptimizerContext(RewriterConfig::Toggle opt_level, + const GrapplerItem& item) + : function_library_(OpRegistry::Global(), item.graph.library()) { + InitializeInlinedFunctions(opt_level, item); } const FunctionLibraryDefinition& function_library() const { @@ -101,8 +100,9 @@ class FunctionOptimizerContext { } private: - void InitializeInlinedFunctions(const GrapplerItem& item) { - bool aggressive = opt_level_ == RewriterConfig::AGGRESSIVE; + void InitializeInlinedFunctions(RewriterConfig::Toggle opt_level, + const GrapplerItem& item) { + bool aggressive = opt_level == RewriterConfig::AGGRESSIVE; for (const FunctionDef& func : item.graph.library().function()) { // Can't create IdentityN nodes with no input or output: skip these @@ -120,7 +120,6 @@ class FunctionOptimizerContext { } } - RewriterConfig::Toggle opt_level_; FunctionLibraryDefinition function_library_; // Functions that can be inlined into optimized graph. std::unordered_map inlined_functions_; @@ -128,9 +127,93 @@ class FunctionOptimizerContext { TF_DISALLOW_COPY_AND_ASSIGN(FunctionOptimizerContext); }; +// Return trimmed FunctionDefLibrary with functions that are reachable from +// the optimized graph. +FunctionDefLibrary TrimFunctionLibrary(const FunctionLibraryDefinition& flib, + const GraphDef& optimized_graph) { + // Functions that are reachable from the optimized graph. + std::unordered_set keep_funcs; + + std::vector func_queue; + func_queue.reserve(flib.num_functions()); + + // Add registered and not already processed functions to the queue by name. + const auto add_to_func_queue = [&](const string& func_name) { + const FunctionDef* func = flib.Find(func_name); + if (func && keep_funcs.find(func_name) == keep_funcs.end()) { + func_queue.push_back(func); + } + }; + + // Find all the functions that are reachable from the given node. + const auto add_node_to_func_queue = [&](const NodeDef& node) { + // Node itself can be a call to the function. + add_to_func_queue(node.op()); + + // Or node can have an attribute referencing a function. + for (const auto& attr : node.attr()) { + const auto& attr_value = attr.second; + + // 1. AttrValue.func + if (attr_value.has_func()) { + add_to_func_queue(attr_value.func().name()); + } + + // 2. AttrValue.ListValue.func + if (attr_value.has_list()) { + for (const auto& func : attr_value.list().func()) { + add_to_func_queue(func.name()); + } + } + } + }; + + // Add all functions that are directly called from the optimized graph. + const auto& graph_nodes = optimized_graph.node(); + std::for_each(graph_nodes.begin(), graph_nodes.end(), add_node_to_func_queue); + + // Process all reachable functions. 
+  while (!func_queue.empty()) {
+    const FunctionDef* func = func_queue.back();
+    func_queue.pop_back();
+
+    const string& func_name = func->signature().name();
+    keep_funcs.insert(func_name);
+
+    // Find all the functions called from the function body.
+    const auto& func_body = func->node_def();
+    std::for_each(func_body.begin(), func_body.end(), add_node_to_func_queue);
+
+    // Check if the function has a registered gradient.
+    const string grad_func_name = flib.FindGradient(func_name);
+    if (!grad_func_name.empty()) add_to_func_queue(grad_func_name);
+  }
+
+  FunctionDefLibrary lib;
+  for (const string& func_name : keep_funcs) {
+    const FunctionDef* func = CHECK_NOTNULL(flib.Find(func_name));
+    *lib.add_function() = *func;
+
+    const string grad_func_name = flib.FindGradient(func_name);
+    if (!grad_func_name.empty()) {
+      GradientDef* gd = lib.add_gradient();
+      gd->set_function_name(func_name);
+      gd->set_gradient_func(grad_func_name);
+    }
+  }
+
+  VLOG(3) << "Trimmed function library: " << keep_funcs.size() << " functions ("
+          << static_cast<int>(keep_funcs.size() - flib.num_functions()) << ")";
+
+  return lib;
+}
+
 Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
                           FunctionOptimizerContext* ctx,
                           GraphDef* optimized_graph) {
+  VLOG(2) << "Specialize function instantiation: "
+          << SummarizeNodeDef(func_node);
+
   const std::unordered_map<string, AttrValue> func_attr(
       func_node.attr().begin(), func_node.attr().end());
 
@@ -141,20 +224,20 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
   TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
 
   // TODO(ezhulenev): Push down const inputs and known input shapes.
-  FunctionDef specialized;
-  TF_RETURN_IF_ERROR(MakeSpecializedFunctionDef(item, flib, &specialized));
+  FunctionDef specialized_func;
+  TF_RETURN_IF_ERROR(MakeFunctionDef(item, flib, &specialized_func));
 
   // Find a name for specialized function.
   const string specialized_func_name =
       UniqueSpecializedFunctionName(func, func_node, flib);
 
-  specialized.mutable_signature()->set_name(specialized_func_name);
-  auto* specialized_attr = specialized.mutable_attr();
+  specialized_func.mutable_signature()->set_name(specialized_func_name);
+  auto* specialized_attr = specialized_func.mutable_attr();
   (*specialized_attr)[kGrapplerSpecializedFuncAttr].set_b(true);
 
   // Add specialized function to the library.
   TF_RETURN_IF_ERROR(
-      ctx->mutable_function_library().AddFunctionDef(specialized));
+      ctx->mutable_function_library().AddFunctionDef(specialized_func));
 
   // Add a function call node for the specialized function.
   NodeDef* specialized_func_node = optimized_graph->add_node();
@@ -226,6 +309,8 @@ Status HookInlinedFunctionOutputs(
 
 Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
                       const FunctionOptimizerContext& ctx,
                       GraphDef* optimized_graph) {
+  VLOG(2) << "Inline function instantiation: " << SummarizeNodeDef(func_node);
+
   const std::unordered_map<string, AttrValue> func_attr(
       func_node.attr().begin(), func_node.attr().end());
 
@@ -359,6 +444,8 @@ class SymbolicGradientEnv {
 
 Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env,
                               GraphDef* inlined_graph) {
+  VLOG(2) << "Inline symbolic gradient: " << SummarizeNodeDef(node);
+
   GraphDef graph_def;
 
   // Create a node to anchor the gradient inputs
@@ -454,13 +541,16 @@ Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env,
 
 Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                    GraphDef* optimized_graph) {
+  VLOG(1) << "Optimize Grappler item: id=" << item.id;
+
   // Nothing to do here.
   if (item.graph.library().function_size() == 0) {
+    VLOG(3) << "Skip Grappler item with empty function library";
     *optimized_graph = item.graph;
     return Status::OK();
   }
 
-  FunctionOptimizerContext ctx(item, opt_level_);
+  FunctionOptimizerContext ctx(opt_level_, item);
   SymbolicGradientEnv env(item.graph.versions().producer(),
                           item.graph.library());
 
@@ -506,9 +596,11 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     *optimized_graph->add_node() = node;
   }
 
-  // TODO(bsteiner): trim the library to remove unused function definitions
   *optimized_graph->mutable_versions() = item.graph.versions();
-  *optimized_graph->mutable_library() = ctx.function_library().ToProto();
+  *optimized_graph->mutable_library() =
+      options_.enable_trim_function_library
+          ? TrimFunctionLibrary(ctx.function_library(), *optimized_graph)
+          : ctx.function_library().ToProto();
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.h b/tensorflow/core/grappler/optimizers/function_optimizer.h
index c555fadf83a..e307b4e533f 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.h
@@ -26,8 +26,9 @@ namespace grappler {
 // operations to make the overall graph more efficient.
 class FunctionOptimizer : public GraphOptimizer {
  public:
-  FunctionOptimizer(RewriterConfig::Toggle opt_level) : opt_level_(opt_level) {}
-  ~FunctionOptimizer() override {}
+  explicit FunctionOptimizer(RewriterConfig::Toggle opt_level)
+      : opt_level_(opt_level) {}
+  ~FunctionOptimizer() override = default;
 
   string name() const override { return "function_optimizer"; };
 
@@ -44,6 +45,7 @@ class FunctionOptimizer : public GraphOptimizer {
     bool enable_function_inlining = true;
     bool enable_function_specialization = true;
     bool enable_symbolic_gradient_inlining = true;
+    bool enable_trim_function_library = true;
   };
 
   RewriterConfig::Toggle opt_level_;
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index fb006d48688..6147e8a27c0 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -31,20 +31,8 @@ constexpr char kDevice[] = "/device:CPU:0";
 
 class FunctionOptimizerTest : public GrapplerTest {
  protected:
-  void DisableAll(FunctionOptimizer* optimizer) {
-    optimizer->options_.enable_function_inlining = false;
+  void DisableFunctionSpecialization(FunctionOptimizer* optimizer) {
     optimizer->options_.enable_function_specialization = false;
-    optimizer->options_.enable_symbolic_gradient_inlining = false;
-  }
-
-  void EnableOnlyFunctionInlining(FunctionOptimizer* optimizer) {
-    DisableAll(optimizer);
-    optimizer->options_.enable_function_inlining = true;
-  }
-
-  void EnableOnlyFunctionSpecialization(FunctionOptimizer* optimizer) {
-    DisableAll(optimizer);
-    optimizer->options_.enable_function_specialization = true;
   }
 };
 
@@ -352,7 +340,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithoutInput) {
   using test::function::NDef;
 
   FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
-  EnableOnlyFunctionInlining(&optimizer);
+  DisableFunctionSpecialization(&optimizer);  // do not specialize noinline func
 
   const Tensor kTwo = test::AsScalar<int64>(2);
   FunctionDef func = FunctionDefHelper::Define(
@@ -626,14 +614,13 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) {
   using test::function::NDef;
 
   FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
-  EnableOnlyFunctionSpecialization(&optimizer);
 
-  // Mark XTimesTwo as noinline
+  // Mark XTimesTwo as noinline.
   FunctionDef x_times_two = test::function::XTimesTwo();
   (*x_times_two.mutable_attr())["_noinline"].set_b(true);
   std::vector<FunctionDef> function_library = {x_times_two};
 
-  // Build a graph to compute y = XTimesTwo(x)
+  // Build a graph to compute y = XTimesTwo(x).
   GrapplerItem item;
   item.graph = test::function::GDef(
       {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
@@ -644,12 +631,13 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) {
   GraphDef output;
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  // Make sure that specialized function was added to the library
-  EXPECT_EQ(2, output.library().function_size());
+  // Make sure that specialized function was added to the library and original
+  // function was removed.
+  EXPECT_EQ(1, output.library().function_size());
   EXPECT_EQ("XTimesTwo_specialized_for_y",
-            output.library().function(1).signature().name());
+            output.library().function(0).signature().name());
 
-  // And 'y' node is calling specialized function
+  // And 'y' node is calling specialized function.
   int count = 0;
   for (const NodeDef& node : output.node()) {
     if (node.name() == "y" && count++) {
@@ -658,7 +646,7 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) {
   }
   EXPECT_EQ(1, count);
 
-  // And that graph evaluation yields the same result
+  // And that graph evaluation yields the same result.
   Tensor pi = test::AsScalar<float>(3.14f);
   item.fetch = {"z"};
   item.feed.emplace_back("x", pi);
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 558b8a77e8a..335fb403f18 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -219,11 +219,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   if (already_optimized) {
     TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph));
     ReassignColocation(optimized_graph);
-    // Make sure that the optimizers preserved the graph version and library.
-    DCHECK_GE(optimized_graph->library().function_size(),
-              item.graph.library().function_size());
-    DCHECK_GE(optimized_graph->library().gradient_size(),
-              item.graph.library().gradient_size());
+    // Make sure that the optimizers preserved the graph version.
     DCHECK_EQ(optimized_graph->versions().producer(),
               item.graph.versions().producer());
   }
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index 638fe1999a6..790809bc670 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -545,6 +545,12 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
   return Status::OK();
 }
 
+Status MakeGrapplerFunctionItem(const FunctionDef& func,
+                                const FunctionLibraryDefinition& flib,
+                                GrapplerFunctionItem* item) {
+  return MakeGrapplerFunctionItem(func, AttrValueMap(), flib, item);
+}
+
 // Register GrapplerFunctionItem input arg expansion and function body outputs
 // in the GrapplerFunctionConnectivity.
 Status RegisterGrapplerFunctionConnectivity(
@@ -560,9 +566,9 @@ Status RegisterGrapplerFunctionConnectivity(
   return Status::OK();
 }
 
-Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item,
-                                  const FunctionLibraryDefinition& flib,
-                                  FunctionDef* func) {
+Status MakeFunctionDef(const GrapplerFunctionItem& item,
+                       const FunctionLibraryDefinition& flib,
+                       FunctionDef* func) {
   func->mutable_signature()->set_name(item.id);
   func->mutable_signature()->set_is_stateful(item.is_stateful());
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index ab369bcad7c..5e8b6c69601 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -38,7 +38,8 @@ using AttrValueMap = std::unordered_map<string, AttrValue>;
 
 // function body in place of function inputs and a resolved input data type.
 struct InputArgExpansion {
   // TODO(ezhulenev): Add support for functions with tensor sequence inputs of
-  // different data types
+  // different data types.
+  // TODO(ezhulenev): Support type parametrized inputs?
   string input_name;  // name of the function input argument
   DataType data_type;  // input data type
   bool is_ref;  // if true, inputs are required to be refs
@@ -53,7 +54,8 @@ struct InputArgExpansion {
 
 // tensors of a function body nodes and a resolved output data type
 struct OutputArgExpansion {
   // TODO(ezhulenev): Add support for functions with tensor sequence outputs of
-  // different data types
+  // different data types.
+  // TODO(ezhulenev): Support type parametrized outputs?
   string output_name;  // name of the function output argument
   DataType data_type;  // output data type
   bool is_ref;  // if true, outputs are refs
@@ -186,13 +188,6 @@ bool HasParametrizedBody(const FunctionDef& func);
 // Check if function has parametrized type or body.
 bool IsParametrized(const FunctionDef& func);
 
-// Make a GrapplerFunctionItem from the function definition and attributes.
-// Return error if the given function def cannot be converted.
-Status MakeGrapplerFunctionItem(
-    const FunctionDef& func,
-    const std::unordered_map<string, AttrValue>& func_instantiation_attr,
-    const FunctionLibraryDefinition& flib, GrapplerFunctionItem* item);
-
 // Register GrapplerFunctionItem input arg expansion and function body outputs
 // in the GrapplerFunctionConnectivity. Use function library definition to
 // lookup function body nodes output names and ranges.
@@ -200,11 +195,28 @@ Status RegisterGrapplerFunctionConnectivity(
     const GrapplerFunctionItem& item, const FunctionLibraryDefinition& flib,
     GrapplerFunctionConnectivity* connectivity);
 
-// Make a specialized FunctionDef from the GrapplerFunctionItem. Use function
-// library definition to lookup function body nodes output names and ranges.
-Status MakeSpecializedFunctionDef(const GrapplerFunctionItem& item,
-                                  const FunctionLibraryDefinition& flib,
-                                  FunctionDef* func);
+// Make a GrapplerFunctionItem from the function definition and function
+// instantiation attributes (caller node attributes). Returns error if the given
+// function def cannot be converted (e.g. not all attributes are defined).
+Status MakeGrapplerFunctionItem(
+    const FunctionDef& func,
+    const std::unordered_map<string, AttrValue>& func_instantiation_attr,
+    const FunctionLibraryDefinition& flib, GrapplerFunctionItem* item);
+
+// Make a GrapplerFunctionItem from the function definition. Function must be
+// fully defined (no type or body parametrization).
+// TODO(ezhulenev): Support parametrized functions without fully defined
+// instantiation attributes? Do we ever want to optimize parametrized function
+// without specializing it to its instantiation attributes (at least types)?
+Status MakeGrapplerFunctionItem(const FunctionDef& func,
+                                const FunctionLibraryDefinition& flib,
+                                GrapplerFunctionItem* item);
+
+// Make a FunctionDef from the GrapplerFunctionItem. Use function library
+// definition to lookup function body nodes output names and ranges.
+Status MakeFunctionDef(const GrapplerFunctionItem& item,
+                       const FunctionLibraryDefinition& flib,
+                       FunctionDef* func);
 
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
index 54d235a8a46..6dfd49b9438 100644
--- a/tensorflow/core/grappler/utils/functions_test.cc
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -524,7 +524,7 @@ TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
   EXPECT_EQ("two", cast.input(0));
 }
 
-TEST_F(FunctionsTest, MakeSpecializedFunctionDef) {
+TEST_F(FunctionsTest, MakeFunctionDef) {
   const Tensor kTwo = test::AsScalar<int64>(2);
   FunctionDef func = FunctionDefHelper::Define(
       // Name
@@ -550,7 +550,7 @@
   TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
 
   FunctionDef specialized;
-  TF_EXPECT_OK(MakeSpecializedFunctionDef(item, flib, &specialized));
+  TF_EXPECT_OK(MakeFunctionDef(item, flib, &specialized));
 
   // Input and output types are resolved based on instantiation attributes.
EXPECT_EQ("x", specialized.signature().input_arg(0).name()); @@ -573,7 +573,7 @@ TEST_F(FunctionsTest, MakeSpecializedFunctionDef) { EXPECT_EQ(2, count); } -TEST_F(FunctionsTest, SwapFunctionBodyAndMakeSpecializedFunctionDef) { +TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) { using test::function::NDef; FunctionDef mul_func = FunctionDefHelper::Create( @@ -606,7 +606,7 @@ TEST_F(FunctionsTest, SwapFunctionBodyAndMakeSpecializedFunctionDef) { // Replace function body with identity function item.SwapFunctionBody(std::move(id_func_body)); FunctionDef specialized; - TF_EXPECT_OK(MakeSpecializedFunctionDef(item, flib, &specialized)); + TF_EXPECT_OK(MakeFunctionDef(item, flib, &specialized)); // Check that graph body was updated. int count = 0; From 19ee0605b6eadb516703c37b7ba38e7122a6c51f Mon Sep 17 00:00:00 2001 From: Nupur Garg Date: Mon, 23 Apr 2018 13:43:13 -0700 Subject: [PATCH 0615/1734] Updating freeze_graph dependencies. PiperOrigin-RevId: 193977096 --- tensorflow/python/BUILD | 1 + tensorflow/python/tools/BUILD | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 698e2a28bf1..9dc03d7cdbc 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -70,6 +70,7 @@ py_library( srcs_version = "PY2AND3", visibility = [ "//tensorflow:__pkg__", + "//tensorflow/python/tools:__pkg__", ], deps = [ ":array_ops", diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD index 84d20f8e362..6c34b6aaf31 100644 --- a/tensorflow/python/tools/BUILD +++ b/tensorflow/python/tools/BUILD @@ -38,9 +38,9 @@ py_library( deps = [ ":saved_model_utils", "//tensorflow/core:protos_all_py", - "//tensorflow/python", # TODO(b/34059704): remove when fixed "//tensorflow/python:client", "//tensorflow/python:framework", + "//tensorflow/python:no_contrib", # TODO(b/34059704): remove when fixed "//tensorflow/python:parsing_ops", "//tensorflow/python:platform", "//tensorflow/python:training", From 955c1edb2f92871597aaf74f5684da4d22843064 Mon Sep 17 00:00:00 2001 From: zhangyaobit Date: Mon, 23 Apr 2018 13:46:26 -0700 Subject: [PATCH 0616/1734] Update layout_optimizer.cc Place data format op on CPU:0. --- tensorflow/core/grappler/optimizers/layout_optimizer.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc index 561226f9454..8fb30d116de 100644 --- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc @@ -919,6 +919,7 @@ class NodeProcessor : public GraphProcessor { ParseNodeName(input_name, &port); if (IsHostMemory(*input, port)) { parsed_name.type = "CPU"; + parsed_name.id = 0; device = DeviceNameUtils::ParsedNameToString(parsed_name); } } From 105c7df01b12b77bc17909cfb4a0d0c0aff87571 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 23 Apr 2018 13:44:57 -0700 Subject: [PATCH 0617/1734] More relaxed size checking for TransposeConv, and miscellaneous bug fixes. 
From 105c7df01b12b77bc17909cfb4a0d0c0aff87571 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Mon, 23 Apr 2018 13:44:57 -0700
Subject: [PATCH 0617/1734] More relaxed size checking for TransposeConv, and
 miscellaneous bug fixes.

PiperOrigin-RevId: 193977375
---
 .../internal/optimized/optimized_ops.h        |  3 +
 .../internal/reference/reference_ops.h        |  3 +
 .../propagate_fixed_sizes.cc                  | 56 +++++++------------
 .../resolve_constant_binary.cc                |  7 ++-
 .../resolve_multiply_by_zero.cc               |  5 ++
 5 files changed, 36 insertions(+), 38 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 49ce1133d34..d585bcca0e5 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -5774,6 +5774,9 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& right_paddings, T* output_data,
                 const Dims<4>& output_dims, const int32_t pad_value) {
   gemmlowp::ScopedProfilingLabel label("Pad");
+  TFLITE_DCHECK_EQ(left_paddings.size(), 4);
+  TFLITE_DCHECK_EQ(right_paddings.size(), 4);
+
   const int output_batch = ArraySize(output_dims, 3);
   const int output_height = ArraySize(output_dims, 2);
   const int output_width = ArraySize(output_dims, 1);
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index d1d4f54f86a..ae295cc8b58 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -3065,6 +3065,9 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
                 const std::vector<int>& right_paddings, T* output_data,
                 const Dims<4>& output_dims, const int32_t pad_value) {
+  TFLITE_DCHECK_EQ(left_paddings.size(), 4);
+  TFLITE_DCHECK_EQ(right_paddings.size(), 4);
+
   const int output_batch = ArraySize(output_dims, 3);
   const int output_height = ArraySize(output_dims, 2);
   const int output_width = ArraySize(output_dims, 1);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index ba244cf5ef5..79464926331 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -168,7 +168,9 @@ void ProcessConvOperator(Model* model, ConvOperator* op) {
     return;
   }
   const auto& input_shape = input_array.shape();
-  CHECK_EQ(input_shape.dimensions_count(), 4);
+  CHECK(input_shape.dimensions_count() == 4)
+      << "Conv ops require 4D inputs. Input array \"" << op->inputs[0]
+      << "\" is " << input_shape.dimensions_count() << "D.";
 
   const auto& weights_array = model->GetArray(op->inputs[1]);
   // Yield until weights dims have been resolved.
@@ -249,12 +251,6 @@ void ProcessTransposeConvOperator(Model* model, TransposeConvOperator* op) {
       << op->inputs[TransposeConvOperator::WEIGHTS] << "\" had shape "
       << toco::ShapeToString(weights_shape) << ".";
 
-  CHECK(weights_shape.dims(0) == 1 && weights_shape.dims(3) == 1)
-      << "TransposeConv weights dimensions must begin and end with 1. Input "
-         "weights \""
-      << op->inputs[TransposeConvOperator::WEIGHTS] << "\" had shape "
-      << toco::ShapeToString(weights_shape) << ".";
-
   // Compute padding
   const int kheight = weights_shape.dims(1);
   const int kwidth = weights_shape.dims(2);
@@ -269,9 +265,7 @@ void ProcessTransposeConvOperator(Model* model, TransposeConvOperator* op) {
     LOG(FATAL) << "TransposeConv only supports SAME or VALID padding";
   }
 
-  // VALIDATE OUTPUT SHAPE
-  // Compute the output shape from the input and weights shapes to verify it
-  // agrees with the specified output shape.
+  // VALIDATE some dimensions and set the output shape.
   const auto& input_array =
       model->GetArray(op->inputs[TransposeConvOperator::DATA_INPUT]);
   if (!input_array.has_shape()) {
@@ -283,31 +277,13 @@
       << "TransposeConv input shape must have 4 dimensions. Input \""
       << op->inputs[TransposeConvOperator::WEIGHTS] << "\" had shape "
       << toco::ShapeToString(weights_shape) << ".";
+  CHECK_EQ(input_shape.dims(3), weights_shape.dims(0))
+      << "Input shape depth and weight depth do not agree";
 
-  // Compute output shape
-  const int input_width = input_shape.dims(2);
-  const int input_height = input_shape.dims(1);
-  int output_height = op->stride_height * (input_height - 1);
-  int output_width = op->stride_width * (input_width - 1);
-  if (op->padding.type == PaddingType::kValid) {
-    output_height += kheight;
-    output_width += kwidth;
-  } else if (op->padding.type == PaddingType::kSame) {
-    output_height += 1;
-    output_width += 1;
-  }
-
-  CHECK(specified_output_shape_array.GetBuffer<ArrayDataType::kInt32>().data ==
-        std::vector<int32>({input_shape.dims(0), output_height, output_width,
-                            weights_shape.dims(3)}))
-      << "Specified output shape: " << ShapeToString(output_array.shape())
-      << ", does not agree with shape computed from input data and weights: ["
-      << input_shape.dims(0) << ", " << output_height << ", " << output_width
-      << ", " << weights_shape.dims(3) << "].";
-
-  // SUCCESS: Set the op's output shape according to the specified output shape.
-  *(output_array.mutable_shape()->mutable_dims()) =
+  // Set the output shape according to the specified output shape.
+  std::vector<int32> const& specified_output_shape =
       specified_output_shape_array.GetBuffer<ArrayDataType::kInt32>().data;
+  *(output_array.mutable_shape()->mutable_dims()) = specified_output_shape;
 }
 
 void ProcessDepthwiseConvOperator(Model* model, DepthwiseConvOperator* op) {
@@ -1179,6 +1155,11 @@ void ProcessRankOperator(Model* model, RankOperator* op) {
     return;
   }
 
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return;
+  }
+
   const auto& input_array = model->GetArray(op->inputs[0]);
   if (!input_array.has_shape()) {
     // Yield until input dims have been resolved.
@@ -1200,6 +1181,11 @@ void ProcessShapeOperator(Model* model, TensorFlowShapeOperator* op) {
     return;
   }
 
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return;
+  }
+
   const auto& input_array = model->GetArray(op->inputs[0]);
   if (!input_array.has_shape()) {
     // Yield until input dims have been resolved.
@@ -1230,10 +1216,6 @@ void ProcessStackOperator(Model* model, StackOperator* op) {
     }
 
     Shape shape = input_array.shape();
-    if (shape.dimensions_count() == 0) {
-      // Convert 0D scalars to 1D scalars of shape {1}.
-      shape.mutable_dims()->push_back(1);
-    }
     if (!stacked_shape) {
       stacked_shape.reset(new Shape(shape));
     } else {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
index 5e779f67652..6e78653fad2 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
@@ -233,7 +233,12 @@ bool ResolveConstantBinaryOperator::Run(Model* model, std::size_t op_index) {
   }
 
   // Check that input data types agree.
-  CHECK(input0_array.data_type == input1_array.data_type);
+  CHECK(input0_array.data_type == input1_array.data_type)
+      << "Dissimilar data types given to op outputting \""
+      << binary_op->outputs[0] << "\". 0:\"" << binary_op->inputs[0] << "\"("
+      << static_cast<int>(input0_array.data_type) << ") 1:\""
+      << binary_op->inputs[1] << "\"("
+      << static_cast<int>(input1_array.data_type) << ").";
 
   // Do the actual constants propagation
   EvaluateBinaryOperatorOnConstantInputs(model, binary_op);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc
index 37beb41dfc5..4bb1217828a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc
@@ -60,6 +60,11 @@ bool ResolveMultiplyByZero::Run(Model* model, std::size_t op_index) {
   const auto& output_array_name = mul_op->outputs[0];
   auto& output_array = model->GetArray(output_array_name);
 
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return false;
+  }
+
   // Yield if the output shape is not known yet.
   if (!output_array.has_shape()) {
     return false;
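For context on the verification patch 0617 removes: the old code recomputed the TransposeConv output extent from the input extent, stride, and kernel size and required the caller-specified output shape to match exactly; the new code only checks that input depth and weight depth agree, then trusts the specified shape. A standalone sketch of the removed arithmetic (illustrative names, not the toco API):

    #include <cassert>

    // Output extent of a transposed convolution along one axis, as the removed
    // check computed it: stride * (input - 1) + kernel for VALID padding, and
    // stride * (input - 1) + 1 for SAME padding.
    int TransposeConvOutputExtent(int input, int stride, int kernel,
                                  bool same_padding) {
      const int output = stride * (input - 1);
      return same_padding ? output + 1 : output + kernel;
    }

    int main() {
      // A 4-wide input with stride 2 and a 3-wide kernel: 2*(4-1)+3 = 9 (VALID).
      assert(TransposeConvOutputExtent(4, 2, 3, /*same_padding=*/false) == 9);
      // Under SAME padding the extent is 2*(4-1)+1 = 7.
      assert(TransposeConvOutputExtent(4, 2, 3, /*same_padding=*/true) == 7);
      return 0;
    }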
From aaf1e32d53e1b473e9d1700afba71662e28150ff Mon Sep 17 00:00:00 2001
From: zhangyaobit
Date: Mon, 23 Apr 2018 13:49:22 -0700
Subject: [PATCH 0618/1734] Update layout_optimizer_test.cc

Place data format op on CPU:0.
---
 tensorflow/core/grappler/optimizers/layout_optimizer_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
index 260347b0e85..b913f2b0041 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
@@ -36,7 +36,7 @@ class LayoutOptimizerTest : public ::testing::Test {
     DeviceProperties device_properties;
     device_properties.set_type("GPU");
     device_properties.mutable_environment()->insert({"architecture", "6"});
-    virtual_cluster_.reset(new VirtualCluster({{"/GPU:0", device_properties}}));
+    virtual_cluster_.reset(new VirtualCluster({{"/GPU:1", device_properties}}));
   }
 
   Output SimpleConv2D(tensorflow::Scope* s, int input_size, int filter_size,

From 9ad432781fce95a397d7d4a8ce506932160b83f1 Mon Sep 17 00:00:00 2001
From: Amit Patankar
Date: Mon, 23 Apr 2018 14:00:28 -0700
Subject: [PATCH 0619/1734] Update install_linux.md

---
 tensorflow/docs_src/install/install_linux.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index f19f827e255..63b8eb30e91 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -48,7 +48,7 @@ must be installed on your system:
   Toolkit.
 * The libcupti-dev library, which is the NVIDIA CUDA Profile Tools Interface.
   This library provides advanced profiling support. To install this library,
-  issue the following command for CUDA Toolkit >= 8.0:
+  issue the following command for CUDA Toolkit >= 9.0:
     $ sudo apt-get install cuda-command-line-tools

From 5db49b64f244b89870aff89a13309796ae060620 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Mon, 23 Apr 2018 14:05:40 -0700
Subject: [PATCH 0620/1734] [XLA] Add xla_builder and xla_computation to every
 test target that will be migrated.

PiperOrigin-RevId: 193981015
---
 tensorflow/compiler/xla/tests/BUILD | 89 +++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)

diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 1f90a44d8ba..25bbde1677c 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -153,6 +153,8 @@ tf_cc_binary(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/core:lib",
@@ -191,6 +193,7 @@ cc_library(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:interpreter_plugin",  # reference backend
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -288,6 +291,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -311,6 +316,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -330,6 +337,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -371,6 +380,8 @@ xla_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -390,6 +401,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -442,6 +454,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -461,6 +475,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -478,6 +494,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -514,6 +532,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -535,6 +555,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -554,6 +576,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -578,6 +602,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -604,6 +630,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -670,6 +697,8 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -715,6 +744,8 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -738,6 +769,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -760,6 +793,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -813,6 +848,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -836,6 +873,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -898,6 +937,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -923,6 +964,8 @@ xla_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -963,6 +1006,8 @@ xla_test(
         "//tensorflow/compiler/xla:array3d",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1038,6 +1083,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1196,6 +1243,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1235,6 +1284,8 @@ xla_test(
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1256,6 +1307,8 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1294,6 +1347,8 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1310,6 +1365,8 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1335,6 +1392,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -1355,6 +1414,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -1428,6 +1489,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1472,6 +1535,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1514,6 +1579,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -1532,6 +1599,8 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1595,6 +1664,8 @@ xla_test(
         ":client_library_test_base",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
@@ -1608,6 +1679,8 @@ xla_test(
         ":client_library_test_base",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
@@ -1629,6 +1702,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:session_proto",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1713,6 +1788,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_runner",
         "//tensorflow/compiler/xla/service:platform_util",
@@ -1740,6 +1817,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_runner",
         "//tensorflow/compiler/xla/service:platform_util",
@@ -1777,6 +1856,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1802,6 +1883,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:platform_util",
@@ -1860,6 +1943,8 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1886,6 +1971,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1982,6 +2069,8 @@ xla_test(
         ":test_utils",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],

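The pattern these added dependencies prepare for (and that the migrations later in this series carry out) is building computations with the client-independent XlaBuilder instead of ComputationBuilder. A minimal sketch, assuming only the two headers the new targets provide; BuildAddOne is an illustrative name:

    #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
    #include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"

    // Unlike ComputationBuilder, XlaBuilder takes only a name (no client), and
    // Build() yields an XlaComputation rather than a Computation.
    xla::StatusOr<xla::XlaComputation> BuildAddOne() {
      xla::XlaBuilder builder("add_one");
      auto x = builder.ConstantR0<float>(41.0f);
      auto one = builder.ConstantR0<float>(1.0f);
      builder.Add(x, one);  // the last op added becomes the computation's root
      return builder.Build();
    }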
From 01bc05347f430039c8efec10131b795178c9e302 Mon Sep 17 00:00:00 2001
From: Igor Saprykin 
Date: Mon, 23 Apr 2018 14:20:49 -0700
Subject: [PATCH 0621/1734] Run the canned estimator test on 2 GPUs as well.

PiperOrigin-RevId: 193983700
---
 .../contrib/distribute/python/estimator_integration_test.py    | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/distribute/python/estimator_integration_test.py b/tensorflow/contrib/distribute/python/estimator_integration_test.py
index c5a520ab5ae..34410a64701 100644
--- a/tensorflow/contrib/distribute/python/estimator_integration_test.py
+++ b/tensorflow/contrib/distribute/python/estimator_integration_test.py
@@ -61,7 +61,8 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase,
           mode=['graph'],
           distribution=[
               combinations.one_device_strategy,
-              combinations.mirrored_strategy_with_gpu_and_cpu
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.mirrored_strategy_with_two_gpus
           ]))
   def test_complete_flow_with_mode(self, distribution):
     label_dimension = 2

From d3b60b2210521a71961f675cb69bbe148b21b8da Mon Sep 17 00:00:00 2001
From: Yifei Feng <1192265+yifeif@users.noreply.github.com>
Date: Mon, 23 Apr 2018 14:24:11 -0700
Subject: [PATCH 0622/1734] Reapply #18446.

---
 tensorflow/python/framework/test_util.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index f954b9d6c73..5a8bc437273 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -1014,6 +1014,8 @@ class TensorFlowTestCase(googletest.TestCase):
       config.graph_options.optimizer_options.opt_level = -1
       config.graph_options.rewrite_options.constant_folding = (
           rewriter_config_pb2.RewriterConfig.OFF)
+      config.graph_options.rewrite_options.arithmetic_optimization = (
+          rewriter_config_pb2.RewriterConfig.OFF)
       return config
 
     if graph is None:

From 1d54aeb8e1f89ac0d13eacca1eac863476f4ee0a Mon Sep 17 00:00:00 2001
From: Benoit Steiner 
Date: Mon, 23 Apr 2018 14:23:11 -0700
Subject: [PATCH 0623/1734] Simplified shape inference for queues

PiperOrigin-RevId: 193984176
---
 .../core/grappler/costs/graph_properties.cc      | 16 ++++------------
 .../core/grappler/costs/graph_properties.h       |  2 +-
 2 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index a0125ce3426..ca30ad83a0c 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -1080,7 +1080,7 @@ Status GraphProperties::PropagateShapes(
       // fanout of the queues, we need to manually propagate the shapes from
       // enqueue node to the corresponding queue.
       TF_RETURN_IF_ERROR(UpdateResource(resource.first, resource.second,
-                                        shape_refiner, relax, new_shapes));
+                                        shape_refiner, new_shapes));
     }
   } while (!new_shapes->empty() &&
            num_resource_iterations++ < max_resource_iterations);
@@ -1094,7 +1094,7 @@ Status GraphProperties::PropagateShapes(
 
 Status GraphProperties::UpdateResource(
     const Node* qnode, const std::unordered_set<const Node*>& queue_inputs,
-    SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes) {
+    SymbolicShapeRefiner* shape_refiner, TopoQueue* new_shapes) {
   // Proceed only if qnode is a queue or an Enter with queue input.
   if (!IsQueue(*qnode) && !IsEnterWithQueue(*qnode)) {
     return Status::OK();
@@ -1108,9 +1108,6 @@ Status GraphProperties::UpdateResource(
   // Merge all inputs into the enqueue node, regardless of which phase we
   // are in.
   std::vector<shape_inference::ShapeAndType> queue_shapes_and_types;
-  if (queue_handle_data) {
-    queue_shapes_and_types = *queue_handle_data;
-  }
   for (const auto& node : queue_inputs) {
     auto ctx = shape_refiner->GetContext(node);
     if (!ctx) {
@@ -1126,13 +1123,8 @@ Status GraphProperties::UpdateResource(
       if (queue_shapes_and_types.empty()) {
         queue_shapes_and_types = shapes_and_types;
       } else {
-        if (relax) {
-          TF_RETURN_IF_ERROR(RelaxEnqueueShapesAndMergeTypes(
-              shape_refiner, qnode, shapes_and_types, &queue_shapes_and_types));
-        } else {
-          TF_RETURN_IF_ERROR(MergeEnqueueShapesAndTypes(
-              shape_refiner, qnode, shapes_and_types, &queue_shapes_and_types));
-        }
+        TF_RETURN_IF_ERROR(RelaxEnqueueShapesAndMergeTypes(
+            shape_refiner, qnode, shapes_and_types, &queue_shapes_and_types));
       }
     }
   }
diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h
index 4c3f3f5f533..a4e3031db14 100644
--- a/tensorflow/core/grappler/costs/graph_properties.h
+++ b/tensorflow/core/grappler/costs/graph_properties.h
@@ -93,7 +93,7 @@ class GraphProperties {
   // enqueue its fanout in 'new_shapes'.
   static Status UpdateResource(
       const Node* qnode, const std::unordered_set<const Node*>& queue_inputs,
-      SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes);
+      SymbolicShapeRefiner* shape_refiner, TopoQueue* new_shapes);
 
   // Update the output shapes of a Merge node, and enqueue its fanout in
   // new_shapes if needed.

From d12244894aa0cdd068b46ebed407ced1915272b2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Mon, 23 Apr 2018 14:39:53 -0700
Subject: [PATCH 0624/1734] Use %zu instead of %lu since size_t is not an
 unsigned long on 32-bit.

PiperOrigin-RevId: 193987261
---
 tensorflow/contrib/lite/optional_debug_tools.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/optional_debug_tools.cc b/tensorflow/contrib/lite/optional_debug_tools.cc
index e0a09101171..dfdd80ea8a4 100644
--- a/tensorflow/contrib/lite/optional_debug_tools.cc
+++ b/tensorflow/contrib/lite/optional_debug_tools.cc
@@ -72,7 +72,7 @@ const char* AllocTypeName(TfLiteAllocationType type) {
 
 // Prints a dump of what tensors and what nodes are in the interpreter.
 void PrintInterpreterState(Interpreter* interpreter) {
-  printf("Interpreter has %lu tensors and %lu nodes\n",
+  printf("Interpreter has %zu tensors and %zu nodes\n",
          interpreter->tensors_size(), interpreter->nodes_size());
   printf("Inputs:");
   PrintIntVector(interpreter->inputs());

From f97fec3cf5d361103d21989b78a74dd1820620d8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Mon, 23 Apr 2018 14:58:58 -0700
Subject: [PATCH 0625/1734] Refactoring triangular_solve.cc to use the new
 common utility functions.

PiperOrigin-RevId: 193990473
---
 .../compiler/tf2xla/lib/triangular_solve.cc   | 82 ++++++-------------
 1 file changed, 25 insertions(+), 57 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
index 7f72a6073df..9bf5821b54a 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
@@ -83,15 +83,6 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
         block_size);
   }
 
-  // Returns [b1, b2, ... , bn, indices[0], indices[1]].
-  auto prepend_batch_dims = [&](std::array<int64, 2> indices) {
-    std::vector<int64> output(ndims);
-    std::copy(batch_dimensions.begin(), batch_dimensions.end(), output.begin());
-    std::copy(indices.begin(), indices.end(),
-              output.begin() + batch_dimensions.size());
-    return output;
-  };
-
   // Applies a complex conjugation operation if `a` is complex and `conjugate_a`
   // is true, otherwise returns its argument.
   auto maybe_conj = [&](xla::ComputationBuilder* builder,
@@ -108,11 +99,12 @@ xla::StatusOr TriangularSolve(
       std::unique_ptr<xla::ComputationBuilder> sub = builder->CreateSubBuilder(
           tensorflow::strings::StrCat("trsm_base_", k));
 
-      auto a_param =
-          sub->Parameter(0,
-                         xla::ShapeUtil::MakeShape(b_shape->element_type(),
-                                                   prepend_batch_dims({k, k})),
-                         "a");
+      auto a_param = sub->Parameter(
+          0,
+          xla::ShapeUtil::MakeShape(
+              b_shape->element_type(),
+              PrependMajorDims(sub.get(), batch_dimensions, {k, k})),
+          "a");
 
       std::array<int64, 2> b_lastd;
       if (left_side) {
@@ -120,11 +112,12 @@ xla::StatusOr TriangularSolve(
       } else {
         b_lastd = {m, k};
       }
-      auto b_param =
-          sub->Parameter(1,
-                         xla::ShapeUtil::MakeShape(b_shape->element_type(),
-                                                   prepend_batch_dims(b_lastd)),
-                         "b");
+      auto b_param = sub->Parameter(
+          1,
+          xla::ShapeUtil::MakeShape(
+              b_shape->element_type(),
+              PrependMajorDims(sub.get(), batch_dimensions, b_lastd)),
+          "b");
 
       // We use a left-looking subroutine on the block diagonal in some common
       // cases, while falling back to a recursive call in unsupported cases. The
@@ -380,14 +373,6 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolveLeftLooking(
     batch_dimensions.push_back(a_size);
   }
 
-  auto prepend_batch_dims = [&](std::array<int64, 2> indices) {
-    std::vector<int64> output(ndims);
-    std::copy(batch_dimensions.begin(), batch_dimensions.end(), output.begin());
-    std::copy(indices.begin(), indices.end(),
-              output.begin() + batch_dimensions.size());
-    return output;
-  };
-
   auto maybe_conj = [&](xla::ComputationBuilder* builder,
                         xla::ComputationDataHandle x) {
     auto perform_conj = a_shape->element_type() == xla::C64 && conjugate_a;
@@ -479,30 +464,6 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolveLeftLooking(
     auto body_b = bodyb->GetTupleElement(input_tuple, 3);
    auto zero = bodyb->ConstantR0<int32>(0);
 
-    // Set up some helper functions.
-    auto prepend_zeros = [&](std::array starts) {
-      auto zero = bodyb->Reshape(bodyb->ConstantR0<int32>(0), {1});
-      std::vector<xla::ComputationDataHandle> padded_starts(ndims, zero);
-      padded_starts[ndims - 2] = bodyb->Reshape(starts[0], {1});
-      padded_starts[ndims - 1] = bodyb->Reshape(starts[1], {1});
-      return bodyb->ConcatInDim(padded_starts, 0);
-    };
-
-    auto dynamic_slice = [&](xla::ComputationDataHandle x,
-                             std::array<xla::ComputationDataHandle, 2> starts,
-                             std::array<int64, 2> sizes) {
-      auto padded_starts = prepend_zeros(starts);
-      auto padded_sizes = prepend_batch_dims(sizes);
-      return bodyb->DynamicSlice(x, padded_starts, padded_sizes);
-    };
-
-    auto update = [&](xla::ComputationDataHandle x,
-                      xla::ComputationDataHandle update,
-                      std::array<xla::ComputationDataHandle, 2> starts) {
-      auto padded_starts = prepend_zeros(starts);
-      return bodyb->DynamicUpdateSlice(x, update, padded_starts);
-    };
-
     // We'd like to implement this:
     //   if transpose_a:
     //     a_row = T(a[..., i+1:, i:i+1])
@@ -516,22 +477,29 @@ xla::StatusOr TriangularSolveLeftLooking(
     // all zeros and use that as zero-padding (doing unnecessary FLOPs).
     xla::ComputationDataHandle a_row;
     if (transpose_a) {
-      a_row = dynamic_slice(body_a, {zero, i}, {m, 1});
+      TF_ASSIGN_OR_RETURN(a_row, DynamicSliceInMinorDims(bodyb.get(), body_a,
+                                                         {zero, i}, {m, 1}));
     } else {
-      a_row = dynamic_slice(body_a, {i, zero}, {1, m});
+      TF_ASSIGN_OR_RETURN(a_row, DynamicSliceInMinorDims(bodyb.get(), body_a,
+                                                         {i, zero}, {1, m}));
     }
     TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(bodyb.get(), a_row, body_out,
                                                 /*transpose_x=*/transpose_a,
                                                 /*transpose_y=*/false,
                                                 /*conjugate_x=*/conjugate_a,
                                                 /*conjugate_y=*/false));
-    auto result_row =
-        bodyb->Sub(dynamic_slice(body_b, {i, zero}, {1, n}), b_update);
+    TF_ASSIGN_OR_RETURN(
+        auto result_row_slice,
+        DynamicSliceInMinorDims(bodyb.get(), body_b, {i, zero}, {1, n}));
+    auto result_row = bodyb->Sub(result_row_slice, b_update);
 
     // body_out[..., i:i+1, :] = result_row / a[..., i:i+1, i:i+1]
-    auto a_elt = dynamic_slice(body_a, {i, i}, {1, 1});
+    TF_ASSIGN_OR_RETURN(auto a_elt, DynamicSliceInMinorDims(bodyb.get(), body_a,
+                                                            {i, i}, {1, 1}));
     auto div_result = bodyb->Div(result_row, maybe_conj(bodyb.get(), a_elt));
-    body_out = update(body_out, div_result, {i, zero});
+    TF_ASSIGN_OR_RETURN(body_out,
+                        DynamicUpdateSliceInMinorDims(bodyb.get(), body_out,
+                                                      div_result, {i, zero}));
 
     // if transpose_a:
     //   return (i - 1, body_out, a, b)

From 6f6c75a7673cd73dfbaaba3f259ce9ab5c8086a1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Mon, 23 Apr 2018 15:00:43 -0700
Subject: [PATCH 0626/1734] [XLA] Redesign: migrate xla/tests/a*, xla/tests/b*.

PiperOrigin-RevId: 193990756
---
 .../xla/tests/array_elementwise_ops_test.cc   | 27 +++---
 .../compiler/xla/tests/axpy_simple_test.cc    |  5 +-
 .../tests/bad_rng_shape_validation_test.cc    | 12 +--
 .../compiler/xla/tests/bfloat16_test.cc       | 13 ++-
 .../compiler/xla/tests/binop_scaling_test.cc  | 14 ++--
 .../xla/tests/broadcast_simple_test.cc        | 82 +++++++++----------
 .../xla/tests/client_library_test_base.cc     |  8 ++
 .../xla/tests/client_library_test_base.h      |  3 +
 8 files changed, 84 insertions(+), 80 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 4b4dc6dd9d3..e8a5efe796a 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
@@ -214,7 +213,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantZeroElementC64s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantU64s) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<uint64> lhs{0xFFFFFFFF,
                           static_cast<uint64>(-1),
@@ -255,7 +254,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantU64s) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantS64s) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<int64> lhs{static_cast<int64>(0x8000000000000000LL),
                          static_cast<int64>(0x8000000000000000LL),
@@ -1332,7 +1331,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowZeroElementF32s) {
 
 // Some Pow cases that can be implemented more efficiently.
 XLA_TEST_F(ArrayElementwiseOpTest, PowSpecialF32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values = {1.0f, 2.0f, 3.2f, -4.0f};
   std::vector<float> exponents = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -1360,7 +1359,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowSpecialF32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, PowOfExpF32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f};
   std::vector<float> values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -1385,7 +1384,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowOfExpF32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, LogOfPowerF32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, 4.0f, 0.5f, 5.7f};
   std::vector<float> values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -1410,7 +1409,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, LogOfPowerF32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, MulOfExpF32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f};
   std::vector<float> values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -1435,7 +1434,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MulOfExpF32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, DivOfExpF32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f};
   std::vector<float> values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -1460,7 +1459,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivOfExpF32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Div3_lhs_F32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f};
   std::vector<float> values1 = {0.1f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -1492,7 +1491,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Div3_lhs_F32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Div3_rhs_F32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f};
   std::vector<float> values1 = {0.1f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -1525,7 +1524,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Div3_rhs_F32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, DivOfPowerF32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f};
   std::vector<float> values1 = {0.1f, 1.0f, 2.0f, 0.5f, 1.0f, 0.5f};
@@ -1558,7 +1557,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivOfPowerF32) {
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, Div4F32) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f};
   std::vector<float> values1 = {0.1f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f};
@@ -2357,7 +2356,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo2DF32) {
 
 XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Eq) {
   // Test broadcasting in Eq comparison.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto v = builder.ConstantR1<int32>({42, 73});
   auto m = builder.ConstantR2<int32>({{42, 73}, {42, 52}});
 
@@ -2783,7 +2782,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, NonIdentityBroadcastOfSameRankIsDisallowed) {
 // Regression test for b/31927799. "slice - y" is fused and requires implicit
 // broadcast.
 XLA_TEST_F(ArrayElementwiseOpTest, ImplictBroadcastInFusedExpressions) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x_literal = Literal::CreateR1<float>({1, 2, 3});
   auto y_literal = Literal::CreateR1<float>({4, 5});
   auto x_data = client_->TransferToServer(*x_literal).ConsumeValueOrDie();
diff --git a/tensorflow/compiler/xla/tests/axpy_simple_test.cc b/tensorflow/compiler/xla/tests/axpy_simple_test.cc
index ec3b46acfec..fcd9ff55e39 100644
--- a/tensorflow/compiler/xla/tests/axpy_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/axpy_simple_test.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include 
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -42,7 +41,7 @@ TEST_F(AxpySimpleTest, AxTenValues) {
 }
 
 XLA_TEST_F(AxpySimpleTest, AxpyZeroValues) {
-  ComputationBuilder builder(client_, "axpy_10");
+  XlaBuilder builder("axpy_10");
   auto alpha = builder.ConstantR0<float>(3.1415926535);
   auto x = builder.ConstantR1<float>({});
   auto y = builder.ConstantR1<float>({});
@@ -54,7 +53,7 @@ XLA_TEST_F(AxpySimpleTest, AxpyZeroValues) {
 }
 
 TEST_F(AxpySimpleTest, AxpyTenValues) {
-  ComputationBuilder builder(client_, "axpy_10");
+  XlaBuilder builder("axpy_10");
   auto alpha = builder.ConstantR0<float>(3.1415926535);
   auto x = builder.ConstantR1<float>(
       {-1.0, 1.0, 2.0, -2.0, -3.0, 3.0, 4.0, -4.0, -5.0, 5.0});
diff --git a/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc b/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc
index e4bf1827acf..22c3394e6f3 100644
--- a/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc
+++ b/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc
@@ -18,9 +18,9 @@ limitations under the License.
 
 #include 
 
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -34,13 +34,13 @@ namespace {
 class BadRngShapeValidationTest : public ClientLibraryTestBase {};
 
 TEST_F(BadRngShapeValidationTest, DefaultConstructedShapeCreatesError) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto zero = builder.ConstantR0<float>(0.0);
   auto one = builder.ConstantR0<float>(1.0);
   Shape default_constructed;
   builder.RngUniform(zero, one, default_constructed);
 
-  StatusOr<Computation> computation = builder.Build();
+  StatusOr<XlaComputation> computation = builder.Build();
   EXPECT_FALSE(computation.ok());
   LOG(INFO) << "status received: " << computation.status();
   EXPECT_THAT(computation.status().error_message(),
@@ -48,7 +48,7 @@ TEST_F(BadRngShapeValidationTest, DefaultConstructedShapeCreatesError) {
 }
 
 TEST_F(BadRngShapeValidationTest, ShapeWithoutLayoutIsOk) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto zero = builder.ConstantR0<float>(0.0);
   auto one = builder.ConstantR0<float>(1.0);
   Shape sans_layout;
@@ -57,7 +57,7 @@ TEST_F(BadRngShapeValidationTest, ShapeWithoutLayoutIsOk) {
 
   builder.RngUniform(zero, one, sans_layout);
 
-  StatusOr<Computation> computation = builder.Build();
+  StatusOr<XlaComputation> computation = builder.Build();
   ASSERT_TRUE(computation.ok());
   LOG(INFO) << computation.status();
 }
diff --git a/tensorflow/compiler/xla/tests/bfloat16_test.cc b/tensorflow/compiler/xla/tests/bfloat16_test.cc
index b853dfaa15d..4e65cf11f3f 100644
--- a/tensorflow/compiler/xla/tests/bfloat16_test.cc
+++ b/tensorflow/compiler/xla/tests/bfloat16_test.cc
@@ -19,10 +19,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -52,7 +51,7 @@ class Bfloat16Test : public ClientLibraryTestBase {
 };
 
 XLA_TEST_F(Bfloat16Test, ScalarOperation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR0<bfloat16>(static_cast<bfloat16>(2.0f));
   auto y = builder.ConstantR0<bfloat16>(static_cast<bfloat16>(1.0f));
   builder.Add(x, y);
@@ -62,7 +61,7 @@ XLA_TEST_F(Bfloat16Test, ScalarOperation) {
 }
 
 XLA_TEST_F(Bfloat16Test, LogOperation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.ConstantR0<bfloat16>(static_cast<bfloat16>(4.0f));
   builder.Log(x);
 
@@ -71,7 +70,7 @@ XLA_TEST_F(Bfloat16Test, LogOperation) {
 }
 
 XLA_TEST_F(Bfloat16Test, NegateScalarF16) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   builder.Neg(builder.ConstantR0<bfloat16>(static_cast<bfloat16>(2.1f)));
 
   ComputeAndCompareR0<bfloat16>(&builder, static_cast<bfloat16>(-2.1f), {},
@@ -80,7 +79,7 @@ XLA_TEST_F(Bfloat16Test, NegateScalarF16) {
 
 XLA_TEST_F(Bfloat16Test, BatchNormTraining) {
   const int kFeatureIndex = 2;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto operand = builder.ConstantR4FromArray4D<bfloat16>(
       {{{{static_cast<bfloat16>(1.f)}, {static_cast<bfloat16>(2.f)}},
@@ -117,7 +116,7 @@ XLA_TEST_F(Bfloat16Test, BatchNormTraining) {
 
 XLA_TEST_F(Bfloat16Test, BatchNormGrad) {
   const int kFeatureIndex = 2;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto operand = builder.ConstantR4FromArray4D<bfloat16>(
       Array4D<bfloat16>(2, 2, 2, 1, static_cast<bfloat16>(0.0f)));
diff --git a/tensorflow/compiler/xla/tests/binop_scaling_test.cc b/tensorflow/compiler/xla/tests/binop_scaling_test.cc
index 97fec89b63f..48203b1d40e 100644
--- a/tensorflow/compiler/xla/tests/binop_scaling_test.cc
+++ b/tensorflow/compiler/xla/tests/binop_scaling_test.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -32,7 +32,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixRowVector_32x4) {
   auto alhs = MakeLinspaceArray2D(0.0, 1.0, 32, 4);
   auto arhs = MakeLinspaceArray2D(0.0, 1.0, 1, 4);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR2FromArray2D<float>(*alhs);
   auto rhs = builder.ConstantR2FromArray2D<float>(*arhs);
   builder.Add(lhs, rhs);
@@ -48,7 +48,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixRowVector_129x129) {
   auto alhs = MakeLinspaceArray2D(0.0, 1.0, 129, 129);
   auto arhs = MakeLinspaceArray2D(0.0, 1.0, 1, 129);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR2FromArray2D<float>(*alhs);
   auto rhs = builder.ConstantR2FromArray2D<float>(*arhs);
   builder.Add(lhs, rhs);
@@ -64,7 +64,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixColVector_9x5) {
   auto alhs = MakeLinspaceArray2D(0.0, 1.0, 9, 5);
   auto arhs = MakeLinspaceArray2D(0.0, 1.0, 9, 1);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR2FromArray2D<float>(*alhs);
   auto rhs = builder.ConstantR2FromArray2D<float>(*arhs);
   builder.Add(lhs, rhs);
@@ -80,7 +80,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixColVector_129x257) {
   auto alhs = MakeLinspaceArray2D(0.0, 1.0, 129, 257);
   auto arhs = MakeLinspaceArray2D(0.0, 1.0, 129, 1);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR2FromArray2D<float>(*alhs);
   auto rhs = builder.ConstantR2FromArray2D<float>(*arhs);
   builder.Add(lhs, rhs);
@@ -93,7 +93,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixColVector_129x257) {
 }
 
 TEST_F(BinopScalingTest, R0PlusR2F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto lhs = builder.ConstantR0<float>(42.0);
   auto rhs = builder.ConstantR2<float>({
       {1.0, 2.0}, {3.0, 4.0},
@@ -109,7 +109,7 @@ TEST_F(BinopScalingTest, R0PlusR2F32) {
 }
 
 TEST_F(BinopScalingTest, R4PlusR0S32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   // clang-format off
   Array4D<int> lhs_array({
     {{{1, 2},
diff --git a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
index 97095f1cc42..34c86e007be 100644
--- a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -33,10 +33,8 @@ namespace {
 
 class BroadcastSimpleTest : public ClientLibraryTestBase {
  public:
-  ComputationDataHandle BuildBinOp(HloOpcode op,
-                                   const ComputationDataHandle& lhs,
-                                   const ComputationDataHandle& rhs,
-                                   ComputationBuilder* builder) {
+  XlaOp BuildBinOp(HloOpcode op, const XlaOp& lhs, const XlaOp& rhs,
+                   XlaBuilder* builder) {
     switch (op) {
       case HloOpcode::kMinimum: {
         return builder->Min(lhs, rhs);
@@ -105,21 +103,21 @@ class BroadcastSimpleTest : public ClientLibraryTestBase {
 using ::testing::HasSubstr;
 
 XLA_TEST_F(BroadcastSimpleTest, ScalarNoOpBroadcast) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Broadcast(b.ConstantR0<float>(1.5), {});
   ComputeAndCompareR0<float>(&b, 1.5, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, ScalarTo2D_2x3) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Broadcast(b.ConstantR0<float>(2.25), {2, 3});
   Array2D<float> expected(2, 3, 2.25);
   ComputeAndCompareR2<float>(&b, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, ScalarParamTo2D_2x3) {
-  ComputationBuilder b(client_, TestName());
-  ComputationDataHandle src;
+  XlaBuilder b(TestName());
+  XlaOp src;
   std::unique_ptr<GlobalData> param_data =
       CreateR0Parameter<float>(2.25f, /*parameter_number=*/0, /*name=*/"src",
                                /*builder=*/&b, /*data_handle=*/&src);
@@ -131,21 +129,21 @@ XLA_TEST_F(BroadcastSimpleTest, ScalarParamTo2D_2x3) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, ScalarTo2D_2x0) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Broadcast(b.ConstantR0<float>(2.25), {2, 0});
   Array2D<float> expected(2, 0);
   ComputeAndCompareR2<float>(&b, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, ScalarTo2D_0x2) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Broadcast(b.ConstantR0<float>(2.25), {0, 2});
   Array2D<float> expected(0, 2);
   ComputeAndCompareR2<float>(&b, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, 1DTo2D) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Broadcast(b.ConstantR1<float>({1, 2, 3}), {2});
 
   Array2D<float> expected(2, 3);
@@ -160,7 +158,7 @@ XLA_TEST_F(BroadcastSimpleTest, 1DTo2D) {
 
 // Tests implicit broadcasting of PREDs.
 XLA_TEST_F(BroadcastSimpleTest, BooleanAnd2DTo3D_Pred) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   Array2D<bool> x_vals(2, 1);
   x_vals(0, 0) = true;
@@ -171,7 +169,7 @@ XLA_TEST_F(BroadcastSimpleTest, BooleanAnd2DTo3D_Pred) {
   y_vals(1, 0, 0) = true;
   y_vals(1, 1, 0) = true;
 
-  ComputationDataHandle x, y;
+  XlaOp x, y;
   auto x_data = CreateR2Parameter<bool>(x_vals, 0, "x", &b, &x);
   auto y_data = CreateR3Parameter<bool>(y_vals, 1, "y", &b, &y);
   b.And(x, y, /*broadcast_dimensions=*/{1, 2});
@@ -186,7 +184,7 @@ XLA_TEST_F(BroadcastSimpleTest, BooleanAnd2DTo3D_Pred) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, ZeroElement_1DTo2D) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Broadcast(b.ConstantR1<float>({}), {2});
 
   Array2D<float> expected(2, 0);
@@ -194,7 +192,7 @@ XLA_TEST_F(BroadcastSimpleTest, ZeroElement_1DTo2D) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, 1DToZeroElement2D) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Broadcast(b.ConstantR1<float>({1, 2, 3}), {0});
 
   Array2D<float> expected(0, 3);
@@ -209,7 +207,7 @@ XLA_TEST_F(BroadcastSimpleTest, InDimensionAndDegenerateBroadcasting) {
   // broadcasting (broadcast_dimensions {1, 2}), then is added to the rhs shape
   // [2, 3, 1]. Degenerate dimension broadcasting then broadcasts the size one
   // dimensions.
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   b.Add(b.ConstantR2<float>({{1.0, 5.0}}),
         b.ConstantLiteral(*Literal::CreateR3<float>(
@@ -247,7 +245,7 @@ class BroadcastR3ImplicitTest
 
 XLA_TEST_P(BroadcastR3ImplicitTest, Doit) {
   const R3ImplicitBroadcastSpec& spec = GetParam();
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Shape r3_shape, r3_implicit_shape;
   Array3D<float> r3_array(spec.output_bounds[0], spec.output_bounds[1],
@@ -264,8 +262,7 @@ XLA_TEST_P(BroadcastR3ImplicitTest, Doit) {
 
   auto r3_implicit_parameter = builder.Parameter(0, r3_implicit_shape, "input");
   auto r3_parameter = builder.Parameter(1, r3_shape, "input");
-  ComputationDataHandle op =
-      BuildBinOp(spec.op, r3_implicit_parameter, r3_parameter, &builder);
+  XlaOp op = BuildBinOp(spec.op, r3_implicit_parameter, r3_parameter, &builder);
 
   Array3D<float> expected_array(spec.output_bounds[0], spec.output_bounds[1],
                                 spec.output_bounds[2]);
@@ -300,9 +297,9 @@ INSTANTIATE_TEST_CASE_P(BroadcastR3ImplicitTestInstances,
 
 // r1 and r3's dim0 matches, and r1's dim1 and dim2 have size 1:
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1_2) {
-  ComputationBuilder b(client_, TestName());
-  ComputationDataHandle r1h;
-  ComputationDataHandle r3h;
+  XlaBuilder b(TestName());
+  XlaOp r1h;
+  XlaOp r3h;
 
   Array3D<float> r1d = {{{1}}, {{2}}};
   Array3D<float> r3d = {{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}};
@@ -319,7 +316,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1_2) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantLiteral(*Literal::CreateR3<float>({{{1, 2}}}));
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -332,7 +329,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_2) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantLiteral(*Literal::CreateR3<float>({{{1}, {2}}}));
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -345,7 +342,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_2) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantLiteral(*Literal::CreateR3<float>({{{1, 2}, {3, 4}}}));
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -358,7 +355,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantLiteral(*Literal::CreateR3<float>({{{1, 2}}, {{3, 4}}}));
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -371,7 +368,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_2) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 =
       b.ConstantLiteral(*Literal::CreateR3<float>({{{1}, {2}}, {{3}, {4}}}));
   auto r3 = b.ConstantLiteral(
@@ -385,7 +382,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_2) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1_2) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantLiteral(*Literal::CreateR3<float>({{{1}}}));
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -491,7 +488,7 @@ class BroadcastR2ImplicitTest
 XLA_TEST_P(BroadcastR2ImplicitTest, Doit) {
   const R2ImplicitBroadcastSpec& spec = GetParam();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   // Operands with degenerate dimensions require implicit broadcasting:
   Shape r2_shape, r2_implicit_shape1, r2_implicit_shape2;
@@ -517,10 +514,9 @@ XLA_TEST_P(BroadcastR2ImplicitTest, Doit) {
   auto r2_implicit_parameter2 =
       builder.Parameter(2, r2_implicit_shape2, "input2");
 
-  ComputationDataHandle op1 =
+  XlaOp op1 =
       BuildBinOp(spec.op1, r2_implicit_parameter1, r2_parameter, &builder);
-  ComputationDataHandle op2 =
-      BuildBinOp(spec.op2, op1, r2_implicit_parameter2, &builder);
+  XlaOp op2 = BuildBinOp(spec.op2, op1, r2_implicit_parameter2, &builder);
 
   Array2D<float> expected_array(spec.output_bounds[0], spec.output_bounds[1]);
 
@@ -547,7 +543,7 @@ INSTANTIATE_TEST_CASE_P(BroadcastR2ImplicitTestInstances,
                         ::testing::ValuesIn(kR2ImplicitBroadcastTestCases));
 
 XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_0) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantLiteral(*Literal::CreateR2<float>({{1, 2}}));
   auto r2 = b.ConstantLiteral(*Literal::CreateR2<float>({{1, 2}, {3, 4}}));
   b.Add(r2, r1);
@@ -558,7 +554,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_0) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_1) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantLiteral(*Literal::CreateR2<float>({{1}, {2}}));
   auto r2 = b.ConstantLiteral(*Literal::CreateR2<float>({{1, 2}, {3, 4}}));
   b.Add(r2, r1);
@@ -569,7 +565,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_1) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim0) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantR1<float>({10, 20});
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -582,7 +578,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim0) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim1) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantR1<float>({10, 20});
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -595,7 +591,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim1) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim2) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1 = b.ConstantR1<float>({10, 20});
   auto r3 = b.ConstantLiteral(
       *Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
@@ -608,7 +604,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim2) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAll) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1_0 = b.ConstantR1<float>({1000, 2000});
   auto r1_1 = b.ConstantR1<float>({100, 200});
   auto r1_2 = b.ConstantR1<float>({10, 20});
@@ -629,7 +625,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAll) {
 }
 
 XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAllWithScalarBroadcast) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto r1_0 = b.ConstantR1<float>({1000, 2000});
   auto r1_1 = b.ConstantR1<float>({100, 200});
   auto r1_2 = b.ConstantR1<float>({10, 20});
@@ -652,7 +648,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAllWithScalarBroadcast) {
 XLA_TEST_F(BroadcastSimpleTest, InvalidBinaryAndDegenerateBroadcasting) {
   // Binary dimension broadcasting of the smaller lhs ([2, 2] up to [2, 2, 2])
   // results in a shape incompatible with the lhs [2, 3, 1].
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   b.Add(b.ConstantR2<float>({{1.0, 5.0}, {1.0, 5.0}}),
         b.ConstantLiteral(*Literal::CreateR3<float>(
@@ -667,7 +663,7 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidBinaryAndDegenerateBroadcasting) {
 
 XLA_TEST_F(BroadcastSimpleTest, InvalidInDimensionBroadcasting) {
   // Test invalid broadcasting with [1, 2] and [2, 3] inputs.
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   b.Add(b.ConstantR2<float>({{1.0, 2.0}}),
         b.ConstantR2<float>({{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}));
@@ -680,7 +676,7 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidInDimensionBroadcasting) {
 
 XLA_TEST_F(BroadcastSimpleTest, InvalidDegenerateBroadcasting) {
   // Test invalid broadcasting with [1, 2] and [2, 3] inputs.
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   b.Add(b.ConstantR2<float>({{1.0, 2.0}}),
         b.ConstantR2<float>({{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}));
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index 69389dae3f2..31c9e216441 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -211,6 +211,14 @@ void ClientLibraryTestBase::ComputeAndCompareR1(
                                                   arguments);
 }
 
+void ClientLibraryTestBase::ComputeAndCompareR1(
+    XlaBuilder* builder, const tensorflow::core::Bitmap& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
+  std::unique_ptr<Literal> expected_literal = Literal::CreateR1(expected);
+  ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal,
+                                                  arguments);
+}
+
 template <typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareLiteral(
     BuilderT* builder, const Literal& expected,
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index 481d7c5c25a..85ebe29ae97 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -165,6 +165,9 @@ class ClientLibraryTestBase : public ::testing::Test {
   void ComputeAndCompareR1(ComputationBuilder* builder,
                            const tensorflow::core::Bitmap& expected,
                            tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+  void ComputeAndCompareR1(XlaBuilder* builder,
+                           const tensorflow::core::Bitmap& expected,
+                           tensorflow::gtl::ArraySlice<GlobalData*> arguments);
 
   template <typename NativeT, typename BuilderT>
   void ComputeAndCompareR2(BuilderT* builder, const Array2D<NativeT>& expected,

From 9e1d93d28fe30171de3f6838028eeadb44b0d6fd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Mon, 23 Apr 2018 15:15:25 -0700
Subject: [PATCH 0627/1734] Changing tf.foldl and tf.foldr to accept
 multiple/nested tensors as element/initializer.

PiperOrigin-RevId: 193993295
---
 .../kernel_tests/functional_ops_test.py       |  40 +++++++
 tensorflow/python/ops/functional_ops.py       | 100 ++++++++++++------
 2 files changed, 110 insertions(+), 30 deletions(-)

diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py
index 34fb655035d..5f48be94da0 100644
--- a/tensorflow/python/kernel_tests/functional_ops_test.py
+++ b/tensorflow/python/kernel_tests/functional_ops_test.py
@@ -70,6 +70,26 @@ class FunctionalOpsTest(test.TestCase):
           initializer=10)
       self.assertAllEqual(880, self.evaluate(r))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testFoldl_SingleInputMultiOutput(self):
+    with self.test_session():
+      elems = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
+      initializer = np.array([1, -1.0])
+      r = functional_ops.foldl(lambda a, x: a + x, elems, initializer)
+      r_value = self.evaluate(r)
+
+      self.assertAllEqual(22, r_value[0])
+      self.assertAllEqual(20, r_value[1])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testFoldl_MultiInputSingleOutput(self):
+    with self.test_session():
+      elems = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
+      initializer = np.array(1.0)
+      r = functional_ops.foldl(lambda a, x: a + x[0] + x[1], (elems, -elems),
+                               initializer)
+      self.assertAllEqual(1, self.evaluate(r))
+
   def testFoldl_Scoped(self):
     with self.test_session() as sess:
       with variable_scope.variable_scope("root") as varscope:
@@ -105,6 +125,26 @@ class FunctionalOpsTest(test.TestCase):
           initializer=10)
       self.assertAllEqual(1282, self.evaluate(r))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testFoldr_SingleInputMultiOutput(self):
+    with self.test_session():
+      elems = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
+      initializer = np.array([1, -1.0])
+      r = functional_ops.foldr(lambda a, x: a + x, elems, initializer)
+      r_value = self.evaluate(r)
+
+      self.assertAllEqual(22, r_value[0])
+      self.assertAllEqual(20, r_value[1])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testFoldr_MultiInputSingleOutput(self):
+    with self.test_session():
+      elems = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
+      initializer = np.array(1.0)
+      r = functional_ops.foldr(lambda a, x: a + x[0] + x[1], (elems, -elems),
+                               initializer)
+      self.assertAllEqual(1, self.evaluate(r))
+
   def testFoldr_Scoped(self):
     with self.test_session() as sess:
       with variable_scope.variable_scope("root") as varscope:
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index 161f6f36596..1b3a1e5cbc1 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -65,10 +65,20 @@ def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
   Suppose that `elems` is unpacked into `values`, a list of tensors. The shape
   of the result tensor is `fn(initializer, values[0]).shape`.
 
+  This method also allows multi-arity `elems` and output of `fn`.  If `elems`
+  is a (possibly nested) list or tuple of tensors, then each of these tensors
+  must have a matching first (unpack) dimension.  The signature of `fn` may
+  match the structure of `elems`.  That is, if `elems` is
+  `(t1, [t2, t3, [t4, t5]])`, then an appropriate signature for `fn` is:
+  `fn = lambda (t1, [t2, t3, [t4, t5]]):`.
+
   Args:
     fn: The callable to be performed.
-    elems: A tensor to be unpacked on dimension 0.
-    initializer: (optional) The initial value for the accumulator.
+    elems: A tensor or (possibly nested) sequence of tensors, each of which
+      will be unpacked along their first dimension.  The nested sequence
+      of the resulting slices will be the first argument to `fn`.
+    initializer: (optional) A tensor or (possibly nested) sequence of tensors,
+      as the initial value for the accumulator.
     parallel_iterations: (optional) The number of iterations allowed to run
       in parallel.
     back_prop: (optional) True enables support for back propagation.
@@ -76,8 +86,9 @@ def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
     name: (optional) Name prefix for the returned tensors.
 
   Returns:
-    A tensor resulting from applying `fn` consecutively to the list of tensors
-    unpacked from `elems`, from first to last.
+    A tensor or (possibly nested) sequence of tensors, resulting from applying
+    `fn` consecutively to the list of tensors unpacked from `elems`, from first
+    to last.
 
   Raises:
     TypeError: if `fn` is not callable.
@@ -92,6 +103,11 @@ def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
   if not callable(fn):
     raise TypeError("fn must be callable.")
 
+  def create_ta(elem):
+    return tensor_array_ops.TensorArray(
+        dtype=elem.dtype, size=n, dynamic_size=False,
+        infer_shape=True).unstack(elem)
+
   in_graph_mode = not context.executing_eagerly()
   with ops.name_scope(name, "foldl", [elems]):
     # TODO(akshayka): Remove the in_graph_mode check once caching devices are
@@ -107,24 +123,26 @@ def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
         varscope.set_caching_device(lambda op: op.device)
         varscope_caching_device_was_none = True
 
-    # Convert elems to tensor array.
-    elems = ops.convert_to_tensor(elems, name="elems")
-    n = array_ops.shape(elems)[0]
-    elems_ta = tensor_array_ops.TensorArray(dtype=elems.dtype, size=n,
-                                            dynamic_size=False,
-                                            infer_shape=True)
-    elems_ta = elems_ta.unstack(elems)
+    # Convert elems to tensor array. n may be known statically.
+    elems_flat = [
+        ops.convert_to_tensor(elem, name="elem") for elem in nest.flatten(elems)
+    ]
+    n = elems_flat[0].shape[0].value or array_ops.shape(elems_flat[0])[0]
+
+    elems_ta = nest.map_structure(create_ta, elems)
 
     if initializer is None:
-      a = elems_ta.read(0)
+      a = nest.map_structure(lambda elem: elem.read(0), elems_ta)
       i = constant_op.constant(1)
     else:
-      a = ops.convert_to_tensor(initializer)
+      a = initializer
       i = constant_op.constant(0)
 
     def compute(i, a):
-      a = fn(a, elems_ta.read(i))
+      elem_i = nest.map_structure(lambda elem: elem.read(i), elems_ta)
+      a = fn(a, elem_i)
       return [i + 1, a]
+
     _, r_a = control_flow_ops.while_loop(
         lambda i, a: i < n, compute, [i, a],
         parallel_iterations=parallel_iterations,
@@ -135,6 +153,7 @@ def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
     # supported in Eager
     if in_graph_mode and varscope_caching_device_was_none:
       varscope.set_caching_device(None)
+
     return r_a
 
 
@@ -153,10 +172,20 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
   Suppose that `elems` is unpacked into `values`, a list of tensors. The shape
   of the result tensor is `fn(initializer, values[0]).shape`.
 
+  This method also allows multi-arity `elems` and output of `fn`.  If `elems`
+  is a (possibly nested) list or tuple of tensors, then each of these tensors
+  must have a matching first (unpack) dimension.  The signature of `fn` may
+  match the structure of `elems`.  That is, if `elems` is
+  `(t1, [t2, t3, [t4, t5]])`, then an appropriate signature for `fn` is:
+  `fn = lambda (t1, [t2, t3, [t4, t5]]):`.
+
   Args:
     fn: The callable to be performed.
-    elems: A tensor that is unpacked into a sequence of tensors to apply `fn`.
-    initializer: (optional) The initial value for the accumulator.
+    elems: A tensor or (possibly nested) sequence of tensors, each of which
+      will be unpacked along their first dimension.  The nested sequence
+      of the resulting slices will be the first argument to `fn`.
+    initializer: (optional) A tensor or (possibly nested) sequence of tensors,
+      as the initial value for the accumulator.
     parallel_iterations: (optional) The number of iterations allowed to run
       in parallel.
     back_prop: (optional) True enables support for back propagation.
@@ -164,8 +193,9 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
     name: (optional) Name prefix for the returned tensors.
 
   Returns:
-    A tensor resulting from applying `fn` consecutively to the list of tensors
-    unpacked from `elems`, from last to first.
+    A tensor or (possibly nested) sequence of tensors, resulting from applying
+    `fn` consecutively to the list of tensors unpacked from `elems`, from last
+    to first.
 
   Raises:
     TypeError: if `fn` is not callable.
@@ -180,6 +210,11 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
   if not callable(fn):
     raise TypeError("fn must be callable.")
 
+  def create_ta(elem):
+    return tensor_array_ops.TensorArray(
+        dtype=elem.dtype, size=n, dynamic_size=False,
+        infer_shape=True).unstack(elem)
+
   in_graph_mode = not context.executing_eagerly()
   with ops.name_scope(name, "foldr", [elems]):
     # TODO(akshayka): Remove the in_graph_mode check once caching devices are
@@ -195,26 +230,30 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
         varscope.set_caching_device(lambda op: op.device)
         varscope_caching_device_was_none = True
 
-    # Convert elems to tensor array.
-    elems = ops.convert_to_tensor(elems, name="elems")
-    n = array_ops.shape(elems)[0]
-    elems_ta = tensor_array_ops.TensorArray(dtype=elems.dtype, size=n,
-                                            dynamic_size=False,
-                                            infer_shape=True)
-    elems_ta = elems_ta.unstack(elems)
+    # Convert elems to tensor array. n may be known statically.
+    elems_flat = [
+        ops.convert_to_tensor(elem, name="elem") for elem in nest.flatten(elems)
+    ]
+    n = elems_flat[0].shape[0].value or array_ops.shape(elems_flat[0])[0]
+
+    elems_ta = nest.map_structure(create_ta, elems)
 
     if initializer is None:
       i = n - 1
-      a = elems_ta.read(i)
+      a = nest.map_structure(lambda elem: elem.read(i), elems_ta)
     else:
       i = n
-      a = ops.convert_to_tensor(initializer)
+      a = initializer
+
     def compute(i, a):
       i -= 1
-      a = fn(a, elems_ta.read(i))
-      return [i, a]
+      elem = nest.map_structure(lambda elem: elem.read(i), elems_ta)
+      a_out = fn(a, elem)
+      return [i, a_out]
+
     _, r_a = control_flow_ops.while_loop(
-        lambda i, a: i > 0, compute, [i, a],
+        lambda i, a: i > 0,
+        compute, [i, a],
         parallel_iterations=parallel_iterations,
         back_prop=back_prop,
         swap_memory=swap_memory)
@@ -223,6 +262,7 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
     # supported in Eager
     if in_graph_mode and varscope_caching_device_was_none:
       varscope.set_caching_device(None)
+
     return r_a
 
 

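Note (illustrative, not part of the patch): a minimal usage sketch of the new
multi-arity foldl, mirroring testFoldl_MultiInputSingleOutput above. It assumes
graph mode and the public tf.foldl wrapper around functional_ops.foldl.

    import numpy as np
    import tensorflow as tf

    elems = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
    # `elems` is a tuple here, so each step's `x` is a matching tuple of
    # slices (elems[i], -elems[i]).
    r = tf.foldl(lambda a, x: a + x[0] + x[1], (elems, -elems),
                 initializer=np.array(1.0))
    with tf.Session() as sess:
      print(sess.run(r))  # 1.0: x[0] + x[1] == 0 at every step.
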
From 01141932a9cdcd871310db141a66a47410c48ac0 Mon Sep 17 00:00:00 2001
From: Igor Ganichev 
Date: Mon, 23 Apr 2018 15:30:12 -0700
Subject: [PATCH 0628/1734] Support executing ops eagerly through XLA

The only real change is to add GpuDeviceInfo to XlaDevice.
It is used by the eager runtime to retrieve the default device context.

PiperOrigin-RevId: 193995586
---
 tensorflow/compiler/jit/BUILD             |   1 +
 tensorflow/compiler/jit/xla_device.cc     |  40 +++++--
 tensorflow/compiler/jit/xla_device.h      |   8 ++
 tensorflow/compiler/jit/xla_gpu_device.cc |   9 ++
 tensorflow/compiler/tests/BUILD           |  19 +++
 tensorflow/compiler/tests/eager_test.py   | 137 ++++++++++++++++++++++
 6 files changed, 206 insertions(+), 8 deletions(-)
 create mode 100644 tensorflow/compiler/tests/eager_test.py

diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 50fa95c4f32..53b124cf890 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -180,6 +180,7 @@ cc_library(
         "//tensorflow/core/kernels:no_op",
         "//tensorflow/core/kernels:sendrecv_ops",
         "//tensorflow/core/kernels:variable_ops",
+        "@com_google_absl//absl/memory",
     ],
 )
 
diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index 12f471735f6..2c2ac839b38 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include 
 #include 
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/jit/xla_compile_on_demand_op.h"
 #include "tensorflow/compiler/jit/xla_device_context.h"
@@ -181,9 +182,15 @@ XlaDevice::XlaDevice(const SessionOptions& options,
       jit_device_name_(jit_device_name),
       xla_allocator_(nullptr),
       platform_(platform),
-      transfer_as_literal_(transfer_as_literal) {}
+      transfer_as_literal_(transfer_as_literal) {
+  VLOG(1) << "Created XLA device " << jit_device_name;
+}
 
-XlaDevice::~XlaDevice() {}
+XlaDevice::~XlaDevice() {
+  if (gpu_device_info_ != nullptr) {
+    gpu_device_info_->default_context->Unref();
+  }
+}
 
 xla::LocalClient* XlaDevice::client() const {
   // We lazily create the client because the platform commits to the
@@ -191,9 +198,8 @@ xla::LocalClient* XlaDevice::client() const {
   // don't want to do it until we get a chance to hook the platform up
   // to a simulator.
 
-  // For now GetOrCreateLocalClient always returns success when passed
-  // a non-null platform. If that changes we may have to plumb in some
-  // way to pass Status back.
+  // TODO(b/78468222): This can fail, at least when the backend is GPU and
+  // there is no GPU on the host.
   return xla::ClientLibrary::GetOrCreateLocalClient(platform_).ValueOrDie();
 }
 
@@ -218,14 +224,32 @@ xla::StatusOr<se::Stream*> XlaDevice::GetStream() {
   return stream_.get();
 }
 
+Status XlaDevice::CreateAndSetGpuDeviceInfo() {
+  if (gpu_device_info_ == nullptr) {
+    TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream());
+    // Call GetAllocator for the side-effect of ensuring the allocator
+    // is created.
+    GetAllocator({});
+    // XlaDevice owns both gpu_device_info_ and
+    // gpu_device_info_->default_context.
+    gpu_device_info_ = absl::make_unique<GpuDeviceInfo>();
+    gpu_device_info_->stream = stream;
+    gpu_device_info_->default_context =
+        new XlaDeviceContext(stream, client(), transfer_as_literal_);
+    gpu_device_info_->default_context->Ref();
+    set_tensorflow_gpu_device_info(gpu_device_info_.get());
+  }
+
+  return Status::OK();
+}
+
 Status XlaDevice::FillContextMap(const Graph* graph,
                                  DeviceContextMap* device_context_map) {
   VLOG(1) << "XlaDevice::FillContextMap";
   device_context_map->resize(graph->num_node_ids());
   TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream());
-  // Call GetAllocator for the side-effect of ensuring the allocator and
-  // XlaTensorInfoManager is created.
-  (void)GetAllocator({});
+  // Call GetAllocator for the side-effect of ensuring the allocator is created.
+  GetAllocator({});
   auto ctx = new XlaDeviceContext(stream, client(), transfer_as_literal_);
   for (Node* n : graph->nodes()) {
     VLOG(2) << n->id() << " : " << n->type_string() << " : " << n->name();
diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h
index 4fe7dd8c9fa..2f5c53aea88 100644
--- a/tensorflow/compiler/jit/xla_device.h
+++ b/tensorflow/compiler/jit/xla_device.h
@@ -105,6 +105,10 @@ class XlaDevice : public LocalDevice {
   xla::LocalClient* client() const;
   xla::StatusOr<::perftools::gputools::Stream*> GetStream();
 
+  // If not already set, create and set GpuDeviceInfo.
+  // Not thread-safe
+  Status CreateAndSetGpuDeviceInfo();
+
  private:
   // The metadata of this XlaDevice.
   const Metadata xla_metadata_;
@@ -123,6 +127,10 @@ class XlaDevice : public LocalDevice {
   // Must we use XLA's transfer manager for correct host<->device transfers? if
   // false, we can use ThenMemcpy() instead.
   bool transfer_as_literal_;
+
+  // If set, holds default device context (that we must Unref)
+  // and its stream.
+  std::unique_ptr<GpuDeviceInfo> gpu_device_info_;
 };
 
 // Builds OpKernel registrations on 'device' for the JIT operators
diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc
index ac60423d959..a8afbf9dcd7 100644
--- a/tensorflow/compiler/jit/xla_gpu_device.cc
+++ b/tensorflow/compiler/jit/xla_gpu_device.cc
@@ -54,6 +54,15 @@ Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& options,
     VLOG(1) << "Failed to create XLA_GPU device: " << status;
     return Status::OK();
   }
+
+  // TODO(b/78468222): Uncomment after fixing this bug
+  // status = device->CreateAndSetGpuDeviceInfo();
+  // if (!status.ok()) {
+  //  errors::AppendToMessage(&status, "while setting up ", DEVICE_GPU_XLA_JIT,
+  //                          " device");
+  //  return status;
+  // }
+
   devices->push_back(device.release());
   return Status::OK();
 }
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 46b86c53aa6..ac2441cea0f 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -308,6 +308,25 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "eager_test",
+    size = "small",
+    srcs = ["eager_test.py"],
+    disabled_backends = [
+        # TODO(b/78199195) Support XLA CPU devices in eager runtime
+        "cpu",
+        "cpu_ondemand",
+        # TODO(b/78468222) Enable GPU backend
+        "gpu",
+    ],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "fft_test",
     size = "medium",
diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py
new file mode 100644
index 00000000000..bdd0185dfe4
--- /dev/null
+++ b/tensorflow/compiler/tests/eager_test.py
@@ -0,0 +1,137 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test cases for eager execution using XLA."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.platform import googletest
+
+
+class EagerTest(XLATestCase):
+
+  def testBasic(self):
+    with self.test_scope():
+      three = constant_op.constant(3)
+      five = constant_op.constant(5)
+      product = three * five
+      self.assertAllEqual(15, product)
+
+  def testExecuteListOutputLen0(self):
+    with self.test_scope():
+      empty = constant_op.constant([], dtype=dtypes.int32)
+      result = array_ops.unstack(empty, 0)
+      self.assertTrue(isinstance(result, list))
+      self.assertEqual(0, len(result))
+
+  def testExecuteListOutputLen1(self):
+    with self.test_scope():
+      split_dim = constant_op.constant(1)
+      value = constant_op.constant([[0, 1, 2], [3, 4, 5]])
+      result = array_ops.split(value, 1, axis=split_dim)
+      self.assertTrue(isinstance(result, list))
+      self.assertEqual(1, len(result))
+      self.assertAllEqual([[0, 1, 2], [3, 4, 5]], result[0])
+
+  def testExecuteListOutputLen3(self):
+    with self.test_scope():
+      split_dim = constant_op.constant(1)
+      value = constant_op.constant([[0, 1, 2], [3, 4, 5]])
+      result = array_ops.split(value, 3, axis=split_dim)
+      self.assertTrue(isinstance(result, list))
+      self.assertEqual(3, len(result))
+      self.assertAllEqual([[0], [3]], result[0])
+      self.assertAllEqual([[1], [4]], result[1])
+      self.assertAllEqual([[2], [5]], result[2])
+
+  def testBasicGraph(self):
+    # Run some ops eagerly
+    with self.test_scope():
+      three = constant_op.constant(3)
+      five = constant_op.constant(5)
+      product = three * five
+      self.assertAllEqual(15, product)
+
+    # Run some ops graphly
+    with context.graph_mode(), self.test_session() as sess:
+      with self.test_scope():
+        three = constant_op.constant(3)
+        five = constant_op.constant(5)
+        product = three * five
+        self.assertAllEqual(15, sess.run(product))
+
+  def testDegenerateSlices(self):
+    with self.test_scope():
+      npt = np.arange(1, 19, dtype=np.float32).reshape(3, 2, 3)
+      t = constant_op.constant(npt)
+      # degenerate by offering a forward interval with a negative stride
+      self.assertAllEqual(npt[0:-1:-1, :, :], t[0:-1:-1, :, :])
+      # degenerate with a reverse interval with a positive stride
+      self.assertAllEqual(npt[-1:0, :, :], t[-1:0, :, :])
+      # empty interval in every dimension
+      self.assertAllEqual(npt[-1:0, 2:2, 2:3:-1], t[-1:0, 2:2, 2:3:-1])
+
+  def testIdentity(self):
+    with self.test_scope():
+      self.assertAllEqual(2, array_ops.identity(2))
+
+  def testIdentityOnVariable(self):
+    with self.test_scope():
+      v = resource_variable_ops.ResourceVariable(True)
+      i = array_ops.identity(v)
+    self.assertAllEqual(True, i.numpy())
+
+  def testAssignAddVariable(self):
+    with self.test_scope():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      v.assign_add(2.0)
+    self.assertEqual(3.0, v.numpy())
+
+  def testGradient(self):
+    def f(x):
+      return x
+
+    with self.test_scope():
+      grad_fn = backprop.gradients_function(f)
+      self.assertAllEqual(2., grad_fn(1., dy=2.)[0])
+
+  def testVariableGradient(self):
+    with self.test_scope():
+      v0 = resource_variable_ops.ResourceVariable(1.0)
+
+      def f():
+        x = v0 * v0
+        return x
+
+      grads = backprop.implicit_grad(f)()
+    self.assertEqual(2., grads[0][0].numpy())
+
+
+if __name__ == "__main__":
+  ops.enable_eager_execution(
+      config=config_pb2.ConfigProto(log_device_placement=True))
+  googletest.main()

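Note (illustrative, not part of the patch): what the new test exercises, as a
standalone sketch. The device string is an assumption; it requires a registered
XLA device, and the CPU/GPU backends are still disabled above (b/78199195,
b/78468222).

    import tensorflow as tf

    tf.enable_eager_execution()
    with tf.device("device:XLA_CPU:0"):  # assumed device name
      x = tf.constant(3.0)
      y = tf.constant(5.0)
      print((x * y).numpy())  # 15.0, executed through XLA
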
From 2f2d4745836fdcf4bf365644017a900d98bd6206 Mon Sep 17 00:00:00 2001
From: Alexandre Passos 
Date: Mon, 23 Apr 2018 15:43:20 -0700
Subject: [PATCH 0629/1734] Not using a control flow context when building
 eager functions.

PiperOrigin-RevId: 193997756
---
 tensorflow/python/eager/function.py | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 0f1170bb420..b924448abe6 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -405,7 +405,15 @@ class GraphModeFunction(object):
       c_known_ops = set()
       c_captured_tensors = set()
 
-      def add_op_internal(op):
+      existing_op_len = len(self._graph.get_operations())
+      filtered_outputs = [x for x in self._returns if x is not None]
+      self._out_grad_placeholders = [
+          graph_placeholder(x.dtype, x.shape) for x in filtered_outputs]
+      in_gradients = gradients_impl.gradients(
+          filtered_outputs,
+          self._input_placeholders,
+          grad_ys=self._out_grad_placeholders)
+      for op in self._graph.get_operations()[existing_op_len:]:
         if op.type in ["Variable", "VariableV2", "VarHandleOp"]:
           raise ValueError("tfe.defun cannot capture variables created without "
                            "using tf.get_variable. Op: %s" % op)
@@ -414,17 +422,6 @@ class GraphModeFunction(object):
           if i.op not in c_known_ops:
             c_captured_tensors.add(i)
 
-      c = HelperContext(add_op_internal)
-
-      with c:
-        filtered_outputs = [x for x in self._returns if x is not None]
-        self._out_grad_placeholders = [
-            graph_placeholder(x.dtype, x.shape) for x in filtered_outputs]
-        in_gradients = gradients_impl.gradients(
-            filtered_outputs,
-            self._input_placeholders,
-            grad_ys=self._out_grad_placeholders)
-
     backward_outputs = tuple(
         grad for grad in _flatten(in_gradients) if grad is not None)
     output_shapes = tuple(grad.shape for grad in backward_outputs)

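Note (illustrative, not part of the patch): the refactor above replaces the
HelperContext callback with a simpler pattern: snapshot the graph's op count,
build the gradient subgraph, then scan only the newly created ops. A standalone
sketch of that pattern (hypothetical example, not TF API):

    import tensorflow as tf

    g = tf.Graph()
    with g.as_default():
      x = tf.placeholder(tf.float32, name="x")
      existing_op_len = len(g.get_operations())  # snapshot before building
      y = tf.square(x) + 2.0 * x                 # ops added after the snapshot
      new_ops = g.get_operations()[existing_op_len:]
      known = set(new_ops)
      # Tensors consumed by the new ops but produced outside them are the
      # "captured" tensors (cf. c_captured_tensors above).
      captured = {t for op in new_ops for t in op.inputs if t.op not in known}
      print(sorted(t.name for t in captured))  # ['x:0']
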
From c8a1eeb98ca394d0330bead37b446bce998bb3d5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Mon, 23 Apr 2018 15:50:56 -0700
Subject: [PATCH 0630/1734] [XLA] Redesign: migrate convolution tests.

PiperOrigin-RevId: 193998684
---
 tensorflow/compiler/xla/BUILD                 |   2 +-
 tensorflow/compiler/xla/reference_util.cc     |   6 +-
 .../convolution_dimension_numbers_test.cc     |  38 +++-
 .../xla/tests/convolution_variants_test.cc    | 167 +++++++++---------
 4 files changed, 116 insertions(+), 97 deletions(-)

diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 88f37433a55..1af9cb6d2ab 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -605,8 +605,8 @@ cc_library(
         ":util",
         ":window_util",
         ":xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:padding",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_evaluator",
         "//tensorflow/compiler/xla/service:shape_inference",
diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc
index ad3a28e1193..df9dbc58308 100644
--- a/tensorflow/compiler/xla/reference_util.cc
+++ b/tensorflow/compiler/xla/reference_util.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include 
 #include 
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
 #include "tensorflow/compiler/xla/service/hlo_evaluator.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -90,7 +90,7 @@ std::unique_ptr<Array2D<float>> MatmulArray2DImpl(
     Padding padding) {
   return ConvArray3DGeneralDimensionsDilated(
       lhs, rhs, kernel_stride, padding, 1, 1,
-      ComputationBuilder::CreateDefaultConvDimensionNumbers(1));
+      XlaBuilder::CreateDefaultConvDimensionNumbers(1));
 }
 
 /*static*/ std::unique_ptr<Array3D<float>>
@@ -140,7 +140,7 @@ ReferenceUtil::ConvArray3DGeneralDimensionsDilated(
     std::pair<int64, int64> kernel_stride, Padding padding) {
   return ConvArray4DGeneralDimensions(
       lhs, rhs, kernel_stride, padding,
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 }
 
 /* static */ std::unique_ptr<Array4D<float>>
diff --git a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
index 896b34fb6e2..b5a42e30598 100644
--- a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
@@ -18,9 +18,9 @@ limitations under the License.
 #include 
 
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -34,13 +34,35 @@ limitations under the License.
 namespace xla {
 namespace {
 
+StatusOr<ConvolutionDimensionNumbers> CreateConvDimensionNumbers(
+    int64 input_batch, int64 input_feature, int64 input_first_spatial,
+    int64 input_second_spatial, int64 output_batch, int64 output_feature,
+    int64 output_first_spatial, int64 output_second_spatial,
+    int64 kernel_output_feature, int64 kernel_input_feature,
+    int64 kernel_first_spatial, int64 kernel_second_spatial) {
+  ConvolutionDimensionNumbers dimension_numbers;
+  dimension_numbers.set_input_batch_dimension(input_batch);
+  dimension_numbers.set_input_feature_dimension(input_feature);
+  dimension_numbers.add_input_spatial_dimensions(input_first_spatial);
+  dimension_numbers.add_input_spatial_dimensions(input_second_spatial);
+  dimension_numbers.set_kernel_output_feature_dimension(kernel_output_feature);
+  dimension_numbers.set_kernel_input_feature_dimension(kernel_input_feature);
+  dimension_numbers.add_kernel_spatial_dimensions(kernel_first_spatial);
+  dimension_numbers.add_kernel_spatial_dimensions(kernel_second_spatial);
+  dimension_numbers.set_output_batch_dimension(output_batch);
+  dimension_numbers.set_output_feature_dimension(output_feature);
+  dimension_numbers.add_output_spatial_dimensions(output_first_spatial);
+  dimension_numbers.add_output_spatial_dimensions(output_second_spatial);
+  TF_RETURN_IF_ERROR(XlaBuilder::Validate(dimension_numbers));
+  return dimension_numbers;
+}
+
 class ConvolutionDimensionNumbersTest : public ClientLibraryTestBase {};
 
 // Tests the convolution operation with invalid input dimension numbers.
 TEST_F(ConvolutionDimensionNumbersTest, InvalidInputDimensionNumbers) {
   auto dimension_numbers_status =
-      ComputationBuilder::CreateConvDimensionNumbers(0, 2, 2, 3, 0, 1, 2, 3, 0,
-                                                     1, 2, 3);
+      CreateConvDimensionNumbers(0, 2, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3);
   ASSERT_FALSE(dimension_numbers_status.ok());
   ASSERT_THAT(dimension_numbers_status.status().error_message(),
               ::testing::HasSubstr("input are not unique"));
@@ -49,8 +71,7 @@ TEST_F(ConvolutionDimensionNumbersTest, InvalidInputDimensionNumbers) {
 // Tests the convolution operation with invalid weight dimension numbers.
 TEST_F(ConvolutionDimensionNumbersTest, InvalidWeightDimensionNumbers) {
   auto dimension_numbers_status =
-      ComputationBuilder::CreateConvDimensionNumbers(0, 1, 2, 3, 0, 1, 2, 3, 0,
-                                                     2, 2, 3);
+      CreateConvDimensionNumbers(0, 1, 2, 3, 0, 1, 2, 3, 0, 2, 2, 3);
   ASSERT_FALSE(dimension_numbers_status.ok());
   ASSERT_THAT(dimension_numbers_status.status().error_message(),
               ::testing::HasSubstr("weight are not unique"));
@@ -59,8 +80,7 @@ TEST_F(ConvolutionDimensionNumbersTest, InvalidWeightDimensionNumbers) {
 // Tests the convolution operation with invalid output dimension numbers.
 TEST_F(ConvolutionDimensionNumbersTest, InvalidOutputDimensionNumbers) {
   auto dimension_numbers_status =
-      ComputationBuilder::CreateConvDimensionNumbers(0, 1, 2, 3, 0, 2, 2, 3, 0,
-                                                     1, 2, 3);
+      CreateConvDimensionNumbers(0, 1, 2, 3, 0, 2, 2, 3, 0, 1, 2, 3);
   ASSERT_FALSE(dimension_numbers_status.ok());
   ASSERT_THAT(dimension_numbers_status.status().error_message(),
               ::testing::HasSubstr("output are not unique"));
@@ -76,14 +96,14 @@ XLA_TEST_F(ConvolutionDimensionNumbersTest,
       client_->TransferToServer(*Literal::CreateR4FromArray4D(*weight_array))
           .ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR4FromArray4D(*input_array);
   auto weight =
       builder.Parameter(0, ShapeUtil::MakeShape(F32, {4, 3, 1, 1}), "weight");
   auto conv1 = builder.Conv(input, weight, {1, 1}, Padding::kValid);
 
   ConvolutionDimensionNumbers dim_nums =
-      ComputationBuilder::CreateDefaultConvDimensionNumbers();
+      XlaBuilder::CreateDefaultConvDimensionNumbers();
   // Swap batch_dimension and feature_dimension.
   int64 old_input_batch_dim = dim_nums.input_batch_dimension();
   int64 old_output_batch_dim = dim_nums.output_batch_dimension();
diff --git a/tensorflow/compiler/xla/tests/convolution_variants_test.cc b/tensorflow/compiler/xla/tests/convolution_variants_test.cc
index 9c1145def8c..50d6e25d868 100644
--- a/tensorflow/compiler/xla/tests/convolution_variants_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_variants_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -52,7 +53,7 @@ class ConvolutionVariantsTest : public ClientLibraryTestBase {
 };
 
 XLA_TEST_F(ConvolutionVariantsTest, Minimal) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   const Array4D<float> input_array(1, 1, 1, 1, {2});
   auto input = builder.ConstantR4FromArray4D(input_array);
@@ -67,7 +68,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Minimal) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, MinimalWithBatch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   const Array4D<float> input_array(5, 1, 1, 1, {1, 2, 3, 4, 5});
   auto input = builder.ConstantR4FromArray4D(input_array);
@@ -82,7 +83,7 @@ XLA_TEST_F(ConvolutionVariantsTest, MinimalWithBatch) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Flat1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(2, 1, 3, 4);
   input_array.FillWithMultiples(1);
@@ -99,7 +100,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Flat1x1) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Deep1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 2, 1, 1, {10, 1});
   auto input = builder.ConstantR4FromArray4D(input_array);
@@ -114,7 +115,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Deep1x1) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in1x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 2, {1, 2});
   auto input = builder.ConstantR4FromArray4D(input_array);
@@ -129,7 +130,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in1x2) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in1x3) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 3, {1, 2, 3});
   auto input = builder.ConstantR4FromArray4D(input_array);
@@ -144,7 +145,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in1x3) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in2x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 2, 2, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D(input_array);
@@ -159,7 +160,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in2x2) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter2x1in2x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 2, 2, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D(input_array);
@@ -174,7 +175,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x1in2x2) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter2x2in2x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 2, 2, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D(input_array);
@@ -189,7 +190,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2in2x2) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in2x3WithDepthAndBatch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(
       2, 2, 2, 3, {0, 1, 2, 3, 4, 5,  6,  7,  8,  9,  0, 0,    // plane 0
@@ -210,7 +211,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in2x3WithDepthAndBatch) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride1x2in1x4) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 4, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D(input_array);
@@ -225,7 +226,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride1x2in1x4) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride1x2in1x5) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 5, {1, 2, 3, 4, 5});
   auto input = builder.ConstantR4FromArray4D(input_array);
@@ -240,7 +241,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride1x2in1x5) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x3stride1x2in1x4) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 4, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D(input_array);
@@ -255,7 +256,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x3stride1x2in1x4) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x3stride1x2in1x5) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 5, {1, 2, 3, 4, 5});
   auto input = builder.ConstantR4FromArray4D(input_array);
@@ -270,7 +271,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x3stride1x2in1x5) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride2x2in3x3) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 3, 3, {1, 2, 3, 4, 5, 6, 7, 8, 9});
   auto input = builder.ConstantR4FromArray4D(input_array);
@@ -285,7 +286,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride2x2in3x3) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter3x1in1x1Padded) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 1, {1});
   auto input = builder.ConstantR4FromArray4D(input_array);
@@ -300,7 +301,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter3x1in1x1Padded) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter5x1in3x1Padded) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 3, {1, 2, 3});
   auto input = builder.ConstantR4FromArray4D(input_array);
@@ -315,7 +316,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter5x1in3x1Padded) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter3x3in2x2Padded) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 2, 2, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D(input_array);
@@ -333,7 +334,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter3x3in2x2Padded) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1in2x1WithPaddingAndDepth) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 2, 1, 2, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D(input_array);
@@ -348,7 +349,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1in2x1WithPaddingAndDepth) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter2x2Stride1x1Input3x3) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 3, 3, {1, 2, 3, 4, 5, 6, 7, 8, 9});
   auto input = builder.ConstantR4FromArray4D(input_array);
@@ -363,7 +364,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2Stride1x1Input3x3) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x2Stride1x1Input1x3) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(1, 1, 1, 3, {1, 2, 3});
   auto input = builder.ConstantR4FromArray4D(input_array);
@@ -378,7 +379,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2Stride1x1Input1x3) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter2x1x8x8Input1x1x8x8) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(64);
   std::iota(input_data.begin(), input_data.end(), 0.0);
@@ -398,7 +399,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x1x8x8Input1x1x8x8) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input16x1x1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(16 * 1 * 1 * 1);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -419,7 +420,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input16x1x1x1) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input16x1x2x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   constexpr int bs = 16;
   constexpr int kx = 2;
@@ -450,7 +451,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input16x1x2x2) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input3x1x2x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   constexpr int kx = 2;
   constexpr int ky = 2;
@@ -482,7 +483,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input3x1x2x2) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x8x8Input16x1x8x8) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(16, 1, 8, 8);
   for (int i0 = 0; i0 < 16; ++i0) {
@@ -510,7 +511,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x8x8Input16x1x8x8) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input1x2x8x8) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(2 * 8 * 8);
   std::iota(input_data.begin(), input_data.end(), 0.0);
@@ -536,7 +537,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input1x2x8x8) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input2x2x8x8) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(2 * 2 * 8 * 8);
   std::iota(input_data.begin(), input_data.end(), 0.0);
@@ -562,7 +563,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input2x2x8x8) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input32x2x8x8) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(32 * 2 * 8 * 8);
   std::iota(input_data.begin(), input_data.end(), 0.0);
@@ -602,7 +603,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input32x2x8x8) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter16x16x1x1Input16x16x1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array4D<float> input_array(16, 16, 1, 1);
   Array4D<float> filter_array(16, 16, 1, 1);
@@ -628,7 +629,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter16x16x1x1Input16x16x1x1) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, FlatRhsDilation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 4 * 6);
   std::iota(input_data.begin(), input_data.end(), 0.0);
@@ -640,14 +641,14 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatRhsDilation) {
   builder.ConvGeneralDilated(
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{},
       /*lhs_dilation=*/{}, /*rhs_dilation=*/{2, 2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   Array4D<float> expected(1, 1, 2, 2, {3924, 4257, 5922, 6255});
   ComputeAndCompareR4(&builder, expected, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation1D) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 1 * 5);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -659,14 +660,14 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation1D) {
   builder.ConvGeneralDilated(
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{},
       /*lhs_dilation=*/{1, 2}, /*rhs_dilation=*/{},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   Array4D<float> expected(1, 1, 1, 8, {10, 2, 20, 3, 30, 4, 40, 5});
   ComputeAndCompareR4(&builder, expected, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 3 * 4);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -682,8 +683,7 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation) {
   builder.ConvGeneralDilated(
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{2, 1},
       /*padding=*/{{1, 0}, {0, 0}}, /*lhs_dilation=*/{3, 2},
-      /*rhs_dilation=*/{},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      /*rhs_dilation=*/{}, XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   Array4D<float> expected(1, 1, 3, 5,
                           {204, 40, 406, 60, 608,       //
@@ -693,7 +693,7 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingOnBothEnds) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 1 * 5);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -705,14 +705,14 @@ XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingOnBothEnds) {
   builder.ConvGeneral(
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{},
       /*padding=*/{{0, 0}, {-1, -1}},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   Array4D<float> expected(1, 1, 1, 2, {23, 34});
   ComputeAndCompareR4(&builder, expected, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingLowAndPositivePaddingHigh) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 1 * 5);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -724,14 +724,14 @@ XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingLowAndPositivePaddingHigh) {
   builder.ConvGeneral(
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{},
       /*padding=*/{{0, 0}, {-1, 2}},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   Array4D<float> expected(1, 1, 1, 5, {23, 34, 45, 50, 0});
   ComputeAndCompareR4(&builder, expected, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingLowAndNegativePaddingHigh) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 1 * 5);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -743,14 +743,14 @@ XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingLowAndNegativePaddingHigh) {
   builder.ConvGeneral(
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{},
       /*padding=*/{{0, 0}, {2, -1}},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   Array4D<float> expected(1, 1, 1, 5, {0, 1, 12, 23, 34});
   ComputeAndCompareR4(&builder, expected, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingAndDilation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 1 * 5);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -763,7 +763,7 @@ XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingAndDilation) {
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{},
       /*padding=*/{{0, 0}, {3, 2}},
       /*lhs_dilation=*/{1, 2}, /*rhs_dilation=*/{1, 2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   // input:
   //   [1, 2, 3, 4, 5] --dilate-> [1, 0, 2, 0, 3, 0, 4, 0, 5]
@@ -775,7 +775,7 @@ XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingAndDilation) {
   ComputeAndCompareR4(&builder, expected, {}, error_spec_);
 }
 XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingAndDilation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 1 * 1 * 5);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -788,7 +788,7 @@ XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingAndDilation) {
       /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{},
       /*padding=*/{{0, 0}, {-3, -2}},
       /*lhs_dilation=*/{1, 2}, /*rhs_dilation=*/{1, 2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
 
   // input:
   //   [1, 2, 3, 4, 5] --dilate-> [1, 0, 2, 0, 3, 0, 4, 0, 5]
@@ -821,7 +821,7 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input1x1x2x3_Filter2x1x1x2) {
   Array4D<float> input_array(bs, iz, iy, ix, input_data);
   Array4D<float> filter_array(oz, iz, ky, kx, kernel_data);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR4FromArray4D(input_array);
   auto filter = builder.ConstantR4FromArray4D(filter_array);
   builder.Conv(input, filter, {1, 1}, Padding::kValid);
@@ -854,7 +854,7 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input1x16x1x1_Filter1x16x1x1) {
   Array4D<float> input_array(bs, iz, iy, ix, input_data);
   Array4D<float> filter_array(oz, iz, ky, kx, kernel_data);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR4FromArray4D(input_array);
   auto filter = builder.ConstantR4FromArray4D(filter_array);
   builder.Conv(input, filter, {1, 1}, Padding::kValid);
@@ -887,7 +887,7 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input16x16x1x1_Filter1x16x1x1) {
   Array4D<float> input_array(bs, iz, iy, ix, input_data);
   Array4D<float> filter_array(oz, iz, ky, kx, kernel_data);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR4FromArray4D(input_array);
   auto filter = builder.ConstantR4FromArray4D(filter_array);
   builder.Conv(input, filter, {1, 1}, Padding::kValid);
@@ -920,7 +920,7 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input16x16x1x1_Filter16x16x1x1) {
   Array4D<float> input_array(bs, iz, iy, ix, input_data);
   Array4D<float> filter_array(oz, iz, ky, kx, kernel_data);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR4FromArray4D(input_array);
   auto filter = builder.ConstantR4FromArray4D(filter_array);
   builder.Conv(input, filter, {1, 1}, Padding::kValid);
@@ -954,7 +954,7 @@ XLA_TEST_F(ConvolutionVariantsTest,
   Array4D<float> input_array(bs, iz, iy, ix, input_data);
   Array4D<float> filter_array(oz, iz, ky, kx, kernel_data);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR4FromArray4D(input_array);
   auto filter = builder.ConstantR4FromArray4D(filter_array);
   builder.Conv(input, filter, {1, 1}, Padding::kValid);
@@ -966,7 +966,7 @@ XLA_TEST_F(ConvolutionVariantsTest,
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x2x1x1Input1x2x3x1GeneralPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 2 * 3 * 1);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -1010,7 +1010,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2x1x1Input1x2x3x1GeneralPadding) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1GeneralPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 2 * 3 * 1);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -1054,7 +1054,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1GeneralPadding) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1NoPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 2 * 3 * 1);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -1095,7 +1095,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1NoPadding) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x3Input1x2x3x2NoPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::vector<float> input_data(1 * 2 * 3 * 2);
   std::iota(input_data.begin(), input_data.end(), 1.0);
@@ -1147,7 +1147,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x3Input1x2x3x2NoPadding) {
 //   BackwardInputConv([1,2,3], [5,6], padding_low=0, padding_high=1)
 XLA_TEST_F(ConvolutionVariantsTest,
            BackwardInputLowPaddingLessThanHighPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto gradients = builder.ConstantR4FromArray4D(
       Array4D<float>(1, 1, 1, 3, /*values=*/{1, 2, 3}));
@@ -1166,19 +1166,18 @@ XLA_TEST_F(ConvolutionVariantsTest,
 //   BackwardInputConv([1], [1,10,100], stride=3, padding=(2,1))
 XLA_TEST_F(ConvolutionVariantsTest,
            BackwardInputLowPaddingGreaterThanHighPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto gradients = builder.ConstantR4FromArray4D(
       Array4D<float>(1, 1, 1, 1, /*values=*/{1}));
   auto weights = builder.ConstantR4FromArray4D(
       Array4D<float>(1, 1, 1, 3, /*values=*/{1, 10, 100}));
   auto mirrored_weights = builder.Rev(weights, {2, 3});
-  builder.ConvGeneralDilated(
-      gradients, mirrored_weights,
-      /*window_strides=*/{1, 1},
-      /*padding=*/{{0, 0}, {0, 3}},
-      /*lhs_dilation=*/{1, 3}, /*rhs_dilation=*/{},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+  builder.ConvGeneralDilated(gradients, mirrored_weights,
+                             /*window_strides=*/{1, 1},
+                             /*padding=*/{{0, 0}, {0, 3}},
+                             /*lhs_dilation=*/{1, 3}, /*rhs_dilation=*/{},
+                             XlaBuilder::CreateDefaultConvDimensionNumbers());
   ComputeAndCompareR4<float>(&builder, {{{{100, 0}}}}, {}, error_spec_);
 }
 
@@ -1187,7 +1186,7 @@ XLA_TEST_F(ConvolutionVariantsTest,
 // into
 //   BackwardInputConv([1], [1,10,100], padding=(1,1))
 XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto gradients = builder.ConstantR4FromArray4D(
       Array4D<float>(1, 1, 1, 1, /*values=*/{1}));
@@ -1208,7 +1207,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding) {
 // However, XLA:GPU doesn't actually fuse it because PadInsertion doesn't
 // support negative padding on backward convolution yet (b/32744257).
 XLA_TEST_F(ConvolutionVariantsTest, BackwardInputWithNegativePaddingHigh) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto gradients = builder.ConstantR4FromArray4D(
       Array4D<float>(1, 1, 1, 3, /*values=*/{1, 2, 3}));
@@ -1224,7 +1223,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputWithNegativePaddingHigh) {
 
 XLA_TEST_F(ConvolutionVariantsTest,
            BackwardFilterLowPaddingLessThanHighPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   // activations:      1,2,3,4  ---pad--> 0,1,2,3,4,0,0
   // gradients:        100,10,1 -dilate-> 100,0,10,0,1
@@ -1240,7 +1239,7 @@ XLA_TEST_F(ConvolutionVariantsTest,
       /*window_strides=*/{1, 1},
       /*padding=*/{{0, 0}, {1, 2}},
       /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
   builder.Transpose(forward_conv, {0, 1, 2, 3});
 
   ComputeAndCompareR4<float>(&builder, {{{{24, 130, 240}}}}, {}, error_spec_);
@@ -1248,7 +1247,7 @@ XLA_TEST_F(ConvolutionVariantsTest,
 
 XLA_TEST_F(ConvolutionVariantsTest,
            BackwardFilterLowPaddingGreaterThanHighPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   // activations:      1,2,3,4  ---pad--> 0,0,1,2,3,4
   // gradients:        100,10,1 -dilate-> 100,0,10,0,1
@@ -1266,14 +1265,14 @@ XLA_TEST_F(ConvolutionVariantsTest,
       /*window_strides=*/{1, 1},
       /*padding=*/{{0, 0}, {2, 0}},
       /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
   builder.Transpose(forward_conv, {0, 1, 2, 3});
 
   ComputeAndCompareR4<float>(&builder, {{{{13, 24}}}}, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   // activations:      1,2,3,4  ---pad--> 0,0,1,2,3,4,0
   // gradients:        100,10,1 -dilate-> 100,0,10,0,1
@@ -1293,14 +1292,14 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding) {
       /*window_strides=*/{1, 1},
       /*padding=*/{{0, 0}, {2, 1}},
       /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers());
+      XlaBuilder::CreateDefaultConvDimensionNumbers());
   builder.Transpose(forward_conv, {0, 1, 2, 3});
 
   ComputeAndCompareR4<float>(&builder, {{{{13, 24, 130}}}}, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding1D) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto gradients = builder.ConstantR3FromArray3D(
       Array3D<float>(1, 1, 1, /*value=*/1));
@@ -1314,26 +1313,26 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding1D) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding1D) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto activations =
       builder.ConstantR3FromArray3D(Array3D<float>({{{1, 2, 3, 4}}}));
   auto gradients =
       builder.ConstantR3FromArray3D(Array3D<float>({{{100, 10, 1}}}));
-  auto forward_conv = builder.ConvGeneralDilated(
-      activations, gradients,
-      /*window_strides=*/{1},
-      /*padding=*/{{2, 1}},
-      /*lhs_dilation=*/{}, /*rhs_dilation=*/{2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers(
-          /*num_spatial_dims=*/1));
+  auto forward_conv =
+      builder.ConvGeneralDilated(activations, gradients,
+                                 /*window_strides=*/{1},
+                                 /*padding=*/{{2, 1}},
+                                 /*lhs_dilation=*/{}, /*rhs_dilation=*/{2},
+                                 XlaBuilder::CreateDefaultConvDimensionNumbers(
+                                     /*num_spatial_dims=*/1));
   builder.Transpose(forward_conv, {0, 1, 2});
 
   ComputeAndCompareR3<float>(&builder, {{{13, 24, 130}}}, {}, error_spec_);
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding3D) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto gradients_flat = Literal::CreateR1<float>({1});
   auto gradients_literal =
@@ -1357,7 +1356,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding3D) {
 }
 
 XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding3D) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto activations_flat = Literal::CreateR1<float>({1, 2, 3, 4});
   auto activations_literal =
@@ -1378,7 +1377,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding3D) {
       /*window_strides=*/{1, 1, 1},
       /*padding=*/{{0, 0}, {0, 0}, {2, 1}},
       /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 1, 2},
-      ComputationBuilder::CreateDefaultConvDimensionNumbers(
+      XlaBuilder::CreateDefaultConvDimensionNumbers(
           /*num_spatial_dims=*/3));
   builder.Transpose(forward_conv, {0, 1, 2, 3, 4});
   ComputeAndCompareLiteral(&builder, *expected_literal, {}, error_spec_);

From bb4a80c92105426ccf20a98c4291a1a3f8499b54 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Mon, 23 Apr 2018 15:56:12 -0700
Subject: [PATCH 0631/1734] Implement exporting the keys/values in a hash
 table.

PiperOrigin-RevId: 193999421
---
 tensorflow/contrib/lookup/lookup_ops_test.py  |  6 +++++
 .../core/kernels/initializable_lookup_table.h |  2 +-
 tensorflow/core/kernels/lookup_table_op.h     | 24 +++++++++++++++++++
 tensorflow/python/ops/lookup_ops.py           | 20 ++++++++++++++++
 4 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lookup/lookup_ops_test.py b/tensorflow/contrib/lookup/lookup_ops_test.py
index f681b7b1327..5d4682ec9f4 100644
--- a/tensorflow/contrib/lookup/lookup_ops_test.py
+++ b/tensorflow/contrib/lookup/lookup_ops_test.py
@@ -58,6 +58,12 @@ class HashTableOpTest(test.TestCase):
       result = output.eval()
       self.assertAllEqual([0, 1, -1], result)
 
+      exported_keys_tensor, exported_values_tensor = table.export()
+
+      self.assertItemsEqual([b"brain", b"salad", b"surgery"],
+                            exported_keys_tensor.eval())
+      self.assertItemsEqual([0, 1, 2], exported_values_tensor.eval())
+
   def testHashTableFindHighRank(self):
     with self.test_session():
       default_val = -1
diff --git a/tensorflow/core/kernels/initializable_lookup_table.h b/tensorflow/core/kernels/initializable_lookup_table.h
index edb779540fb..990cbceac26 100644
--- a/tensorflow/core/kernels/initializable_lookup_table.h
+++ b/tensorflow/core/kernels/initializable_lookup_table.h
@@ -51,7 +51,7 @@ class InitializableLookupTable : public LookupInterface {
         "Insert not supported by InitializableLookupTable implementations");
   }
 
-  Status ExportValues(OpKernelContext* context) final {
+  Status ExportValues(OpKernelContext* context) {
     return errors::Unimplemented(
         "ExportValues not supported by InitializableLookupTable "
         "implementations");
diff --git a/tensorflow/core/kernels/lookup_table_op.h b/tensorflow/core/kernels/lookup_table_op.h
index 29a0cc91fe0..3977f16299f 100644
--- a/tensorflow/core/kernels/lookup_table_op.h
+++ b/tensorflow/core/kernels/lookup_table_op.h
@@ -177,6 +177,30 @@ class HashTable : public InitializableLookupTable {
     return table_ ? table_->size() : 0;
   }
 
+  Status ExportValues(OpKernelContext* context) override {
+    if (!is_initialized_) {
+      return errors::Aborted("HashTable is not initialized.");
+    }
+
+    const int64 size = table_->size();
+
+    Tensor* keys;
+    Tensor* values;
+    TF_RETURN_IF_ERROR(
+        context->allocate_output("keys", TensorShape({size}), &keys));
+    TF_RETURN_IF_ERROR(
+        context->allocate_output("values", TensorShape({size}), &values));
+
+    auto keys_data = keys->flat<K>();
+    auto values_data = values->flat<V>();
+    int64 i = 0;
+    for (auto it = table_->begin(); it != table_->end(); ++it, ++i) {
+      keys_data(i) = it->first;
+      values_data(i) = it->second;
+    }
+    return Status::OK();
+  }
+
   DataType key_dtype() const override { return DataTypeToEnum<K>::v(); }
 
   DataType value_dtype() const override { return DataTypeToEnum<V>::v(); }
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index 6f043f60e67..0e547689cc5 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -277,7 +277,27 @@ class HashTable(InitializableLookupTableBase):
           name=scope)
 
       super(HashTable, self).__init__(table_ref, default_value, initializer)
+      self._value_shape = self._default_value.get_shape()
 
+  def export(self, name=None):
+    """Returns tensors of all keys and values in the table.
+
+    Args:
+      name: A name for the operation (optional).
+
+    Returns:
+      A pair of tensors with the first tensor containing all keys and the
+        second tensor containing all values in the table.
+    """
+    with ops.name_scope(name, "%s_Export" % self._name,
+                        [self._table_ref]) as name:
+      with ops.colocate_with(self._table_ref):
+        exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
+            self._table_ref, self._key_dtype, self._value_dtype, name=name)
+
+    exported_values.set_shape(exported_keys.get_shape().concatenate(
+        self._value_shape))
+    return exported_keys, exported_values
 
 class TableInitializerBase(object):
   """Base class for lookup table initializers."""

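A minimal usage sketch of the new `export` method, mirroring the test added
above. The table construction is the standard `tf.contrib.lookup` API of this
era (not part of this patch), and note that the exported pairs come back in an
unspecified order, which is why the test uses `assertItemsEqual`:

```python
import tensorflow as tf

keys = tf.constant(["brain", "salad", "surgery"])
values = tf.constant([0, 1, 2], dtype=tf.int64)
table = tf.contrib.lookup.HashTable(
    tf.contrib.lookup.KeyValueTensorInitializer(keys, values),
    default_value=-1)

# Build the export ops; they become valid to run once the table is initialized.
exported_keys, exported_values = table.export()

with tf.Session() as sess:
  table.init.run()
  print(sess.run(exported_keys))    # e.g. [b'brain' b'salad' b'surgery']
  print(sess.run(exported_values))  # e.g. [0 1 2]
```
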
From ff15c81e2b92ef8fb47bb15790cffd18377a4ef2 Mon Sep 17 00:00:00 2001
From: Andrew Cotter 
Date: Mon, 23 Apr 2018 15:57:02 -0700
Subject: [PATCH 0632/1734] This is a library for performing constrained
 optimization. It defines two interfaces: ConstrainedMinimizationProblem,
 which specifies a constrained optimization problem, and ConstrainedOptimizer,
 which is slightly different from a tf.train.Optimizer, mostly due to the fact
 that it is meant to optimize ConstrainedMinimizationProblems. In addition to
 these two interfaces, three ConstrainedOptimizer implementations are
 included, as well as helper functions which, given a set of candidate
 solutions, heuristically find the best candidate (to the constrained
 problem), or the best distribution over candidates.

For more details, please see our arXiv paper: "https://arxiv.org/abs/1804.06500".

PiperOrigin-RevId: 193999550
---
 tensorflow/contrib/BUILD                      |   1 +
 tensorflow/contrib/__init__.py                |   1 +
 tensorflow/contrib/cmake/python_modules.txt   |   2 +
 .../contrib/constrained_optimization/BUILD    |  91 +++
 .../constrained_optimization/README.md        | 345 ++++++++++
 .../constrained_optimization/__init__.py      |  41 ++
 .../python/candidates.py                      | 319 ++++++++++
 .../python/candidates_test.py                 |  95 +++
 .../constrained_minimization_problem.py       | 123 ++++
 .../python/constrained_optimizer.py           | 208 ++++++
 .../python/external_regret_optimizer.py       | 375 +++++++++++
 .../python/external_regret_optimizer_test.py  | 136 ++++
 .../python/swap_regret_optimizer.py           | 595 ++++++++++++++++++
 .../python/swap_regret_optimizer_test.py      | 212 +++++++
 .../python/test_util.py                       |  58 ++
 tensorflow/tools/pip_package/BUILD            |   1 +
 16 files changed, 2603 insertions(+)
 create mode 100644 tensorflow/contrib/constrained_optimization/BUILD
 create mode 100644 tensorflow/contrib/constrained_optimization/README.md
 create mode 100644 tensorflow/contrib/constrained_optimization/__init__.py
 create mode 100644 tensorflow/contrib/constrained_optimization/python/candidates.py
 create mode 100644 tensorflow/contrib/constrained_optimization/python/candidates_test.py
 create mode 100644 tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py
 create mode 100644 tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py
 create mode 100644 tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py
 create mode 100644 tensorflow/contrib/constrained_optimization/python/external_regret_optimizer_test.py
 create mode 100644 tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
 create mode 100644 tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer_test.py
 create mode 100644 tensorflow/contrib/constrained_optimization/python/test_util.py

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 8edb8654b83..abdbdb4cd22 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -31,6 +31,7 @@ py_library(
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
         "//tensorflow/contrib/coder:coder_py",
         "//tensorflow/contrib/compiler:compiler_py",
+        "//tensorflow/contrib/constrained_optimization",
         "//tensorflow/contrib/copy_graph:copy_graph_py",
         "//tensorflow/contrib/crf:crf_py",
         "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_py",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index 0d163daa6e2..7f33d460dce 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -29,6 +29,7 @@ from tensorflow.contrib import cloud
 from tensorflow.contrib import cluster_resolver
 from tensorflow.contrib import coder
 from tensorflow.contrib import compiler
+from tensorflow.contrib import constrained_optimization
 from tensorflow.contrib import copy_graph
 from tensorflow.contrib import crf
 from tensorflow.contrib import cudnn_rnn
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 932a6eeeaad..2554b3a6e04 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -147,6 +147,8 @@ tensorflow/contrib/coder/python
 tensorflow/contrib/coder/python/layers
 tensorflow/contrib/coder/python/ops
 tensorflow/contrib/compiler
+tensorflow/contrib/constrained_optimization
+tensorflow/contrib/constrained_optimization/python
 tensorflow/contrib/copy_graph
 tensorflow/contrib/copy_graph/python
 tensorflow/contrib/copy_graph/python/util
diff --git a/tensorflow/contrib/constrained_optimization/BUILD b/tensorflow/contrib/constrained_optimization/BUILD
new file mode 100644
index 00000000000..619153df67c
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/BUILD
@@ -0,0 +1,91 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+# Transitive dependencies of this target will be included in the pip package.
+py_library(
+    name = "constrained_optimization_pip",
+    deps = [
+        ":constrained_optimization",
+        ":test_util",
+    ],
+)
+
+py_library(
+    name = "constrained_optimization",
+    srcs = [
+        "__init__.py",
+        "python/candidates.py",
+        "python/constrained_minimization_problem.py",
+        "python/constrained_optimizer.py",
+        "python/external_regret_optimizer.py",
+        "python/swap_regret_optimizer.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:standard_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "candidates_test",
+    srcs = ["python/candidates_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":constrained_optimization",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
+# NOTE: This library can't be "testonly" since it needs to be included in the
+# pip package.
+py_library(
+    name = "test_util",
+    srcs = ["python/test_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":constrained_optimization",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:standard_ops",
+    ],
+)
+
+py_test(
+    name = "external_regret_optimizer_test",
+    srcs = ["python/external_regret_optimizer_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":constrained_optimization",
+        ":test_util",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:standard_ops",
+        "//tensorflow/python:training",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "swap_regret_optimizer_test",
+    srcs = ["python/swap_regret_optimizer_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":constrained_optimization",
+        ":test_util",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:standard_ops",
+        "//tensorflow/python:training",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/contrib/constrained_optimization/README.md b/tensorflow/contrib/constrained_optimization/README.md
new file mode 100644
index 00000000000..c65a150464e
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/README.md
@@ -0,0 +1,345 @@
+
+
+# ConstrainedOptimization (TFCO)
+
+TFCO is a library for optimizing inequality-constrained problems in TensorFlow.
+Both the objective function and the constraints are represented as Tensors,
+giving users the maximum amount of flexibility in specifying their optimization
+problems.
+
+This flexibility makes optimization considerably more difficult: on a non-convex
+problem, if one uses the "standard" approach of introducing a Lagrange
+multiplier for each constraint, and then jointly maximizing over the Lagrange
+multipliers and minimizing over the model parameters, then a stable stationary
+point might not even *exist*. Hence, in some cases, oscillation, instead of
+convergence, is inevitable.
+
+Thankfully, it turns out that even if, over the course of optimization, no
+*particular* iterate does a good job of minimizing the objective while
+satisfying the constraints, the *sequence* of iterates, on average, usually
+will. This observation suggests the following approach: at training time, we'll
+periodically snapshot the model state during optimization; then, at evaluation
+time, each time we're given a new example to evaluate, we'll sample one of the
+saved snapshots uniformly at random, and apply it to the example. This
+*stochastic model* will generally perform well, both with respect to the
+objective function, and the constraints.
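+
+As a hypothetical sketch (none of these names come from the library), the
+evaluation-time half of this procedure is just uniform sampling over the
+saved snapshots:
+
+```python
+import numpy as np
+
+def stochastic_predict(snapshots, example, predict_fn):
+  """Evaluates a stochastic model: applies a uniformly-sampled snapshot."""
+  snapshot = snapshots[np.random.randint(len(snapshots))]
+  return predict_fn(snapshot, example)
+```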
+
+In fact, we can do better: it's possible to post-process the set of snapshots to
+find a distribution over at most $$m+1$$ snapshots, where $$m$$ is the number of
+constraints, that will be at least as good (and will usually be much better)
+than the (much larger) uniform distribution described above. If you're unable or
+unwilling to use a stochastic model at all, then you can instead use a heuristic
+to choose the single best snapshot.
+
+For full details, motivation, and theoretical results on the approach taken by
+this library, please refer to:
+
+> Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+> Constrained Optimization".
+> [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+which will be referred to as [CoJiSr18] throughout the remainder of this
+document.
+
+### Proxy Constraints
+
+Imagine that we want to constrain the recall of a binary classifier to be at
+least 90%. Since the recall is proportional to the number of true positive
+classifications, which itself is a sum of indicator functions, this constraint
+is non-differentiable, and therefore cannot be used in a problem that will be
+optimized using a (stochastic) gradient-based algorithm.
+
+For this and similar problems, TFCO supports so-called *proxy constraints*,
+which are (at least semi-differentiable) approximations of the original
+constraints. For example, one could create a proxy recall function by replacing
+the indicator functions with sigmoids. During optimization, each proxy
+constraint function will be penalized, with the magnitude of the penalty being
+chosen to satisfy the corresponding *original* (non-proxy) constraint.
+
+On a problem including proxy constraints—even a convex problem—the
+Lagrangian approach discussed above isn't guaranteed to work. However, a
+different algorithm, based on minimizing *swap regret*, does work. Aside from
+this difference, the recommended procedure for optimizing a proxy-constrained
+problem remains the same: periodically snapshot the model during optimization,
+and then either find the best $$m+1$$-sized distribution, or heuristically
+choose the single best snapshot.
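+
+As a concrete illustration (a sketch, not library code), assume `labels`
+contains 0/1 floats and `predictions` contains the model's scores. The exact
+recall and a sigmoid-based proxy might be computed as:
+
+```python
+# Exact recall: built from non-differentiable indicators 1(prediction > 0).
+true_positives = labels * tf.to_float(predictions > 0)
+recall = tf.reduce_sum(true_positives) / tf.reduce_sum(labels)
+
+# Proxy: replace each indicator with a sigmoid, which is differentiable.
+proxy_true_positives = labels * tf.sigmoid(predictions)
+proxy_recall = tf.reduce_sum(proxy_true_positives) / tf.reduce_sum(labels)
+```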
+
+## Components
+
+*   [constrained_minimization_problem](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py):
+    contains the `ConstrainedMinimizationProblem` interface. Your own
+    constrained optimization problems should be represented using
+    implementations of this interface.
+
+*   [constrained_optimizer](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py):
+    contains the `ConstrainedOptimizer` interface, which is similar to (but
+    different from) `tf.train.Optimizer`, with the main difference being that
+    `ConstrainedOptimizer`s are given `ConstrainedMinimizationProblem`s to
+    optimize, and perform constrained optimization.
+
+    *   [external_regret_optimizer](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py):
+        contains the `AdditiveExternalRegretOptimizer` implementation, which is
+        a `ConstrainedOptimizer` implementing the Lagrangian approach discussed
+        above (with additive updates to the Lagrange multipliers). You should
+        use this optimizer for problems *without* proxy constraints. It may also
+        work for problems with proxy constraints, but we recommend using a swap
+        regret optimizer, instead.
+
+        This optimizer is most similar to Algorithm 3 in Appendix C.3 of
+        [CoJiSr18], and is discussed in Section 3. The two differences are that
+        it uses proxy constraints (if they're provided) in the update of the
+        model parameters, and uses `tf.train.Optimizer`s, instead of SGD, for
+        the "inner" updates.
+
+    *   [swap_regret_optimizer](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py):
+        contains the `AdditiveSwapRegretOptimizer` and
+        `MultiplicativeSwapRegretOptimizer` implementations, which are
+        `ConstrainedOptimizer`s implementing the swap-regret minimization
+        approach mentioned above (with additive or multiplicative updates,
+        respectively, to the parameters associated with the
+        constraints—these parameters are not Lagrange multipliers, but
+        play a similar role). You should use one of these optimizers (we suggest
+        `MultiplicativeSwapRegretOptimizer`) for problems *with* proxy
+        constraints.
+
+        The `MultiplicativeSwapRegretOptimizer` is most similar to Algorithm 2
+        in Section 4 of [CoJiSr18], with the difference being that it uses
+        `tf.train.Optimizer`s, instead of SGD, for the "inner" updates. The
+        `AdditiveSwapRegretOptimizer` differs further in that it performs
+        additive (instead of multiplicative) updates of the stochastic matrix.
+
+*   [candidates](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/candidates.py):
+    contains two functions, `find_best_candidate_distribution` and
+    `find_best_candidate_index`. Both of these functions are given a set of
+    candidate solutions to a constrained optimization problem, from which the
+    former finds the best distribution over at most $$m+1$$ candidates, and the
+    latter heuristically finds the single best candidate. As discussed above,
+    the set of candidates will typically be model snapshots saved periodically
+    during optimization. Both of these functions require that scipy be
+    installed.
+
+    The `find_best_candidate_distribution` function implements the approach
+    described in Lemma 3 of [CoJiSr18], while `find_best_candidate_index`
+    implements the heuristic used for hyperparameter search in the experiments
+    of Section 5.2.
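+
+A hypothetical calling sketch (the numbers are made up, and `tfco` is defined
+as in the example below): given four snapshots of a problem with one
+constraint,
+
+```python
+import numpy as np
+
+# objective_vector has shape (n,); constraints_matrix has shape (m, n).
+objective_vector = np.array([0.8, 0.6, 0.7, 0.9])
+constraints_matrix = np.array([[0.05, 0.10, -0.01, -0.20]])
+
+# A distribution over the n candidates with at most m+1 nonzero entries.
+distribution = tfco.find_best_candidate_distribution(objective_vector,
+                                                     constraints_matrix)
+# The index of the heuristically-best single candidate.
+index = tfco.find_best_candidate_index(objective_vector, constraints_matrix)
+```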
+
+## Convex Example with Proxy Constraints
+
+This is a simple example of recall-constrained optimization on simulated data:
+we will try to find a classifier that minimizes the average hinge loss while
+constraining recall to be at least 90%.
+
+We'll start with the required imports—notice the definition of `tfco`:
+
+```python
+import math
+import numpy as np
+import tensorflow as tf
+
+tfco = tf.contrib.constrained_optimization
+```
+
+We'll now create an implementation of the `ConstrainedMinimizationProblem` class
+for this problem. The constructor takes three parameters: a Tensor containing
+the classification labels (0 or 1) for every training example, another Tensor
+containing the model's predictions on every training example (sometimes called
+the "logits"), and the lower bound on recall that will be enforced using a
+constraint.
+
+This implementation will contain both constraints *and* proxy constraints: the
+former represents the constraint that the true recall (defined in terms of the
+*number* of true positives) be at least `recall_lower_bound`, while the latter
+represents the same constraint, but on a hinge approximation of the recall.
+
+```python
+class ExampleProblem(tfco.ConstrainedMinimizationProblem):
+
+  def __init__(self, labels, predictions, recall_lower_bound):
+    self._labels = labels
+    self._predictions = predictions
+    self._recall_lower_bound = recall_lower_bound
+    # The number of positively-labeled examples.
+    self._positive_count = tf.reduce_sum(self._labels)
+
+  @property
+  def objective(self):
+    return tf.losses.hinge_loss(labels=self._labels, logits=self._predictions)
+
+  @property
+  def constraints(self):
+    true_positives = self._labels * tf.to_float(self._predictions > 0)
+    true_positive_count = tf.reduce_sum(true_positives)
+    recall = true_positive_count / self._positive_count
+    # The constraint is (recall >= self._recall_lower_bound), which we convert
+    # to (self._recall_lower_bound - recall <= 0) because
+    # ConstrainedMinimizationProblems must always provide their constraints in
+    # the form (tensor <= 0).
+    #
+    # The result of this function should be a tensor, with each element being
+    # a quantity that is constrained to be nonpositive. We only have one
+    # constraint, so we return a one-element tensor.
+    return self._recall_lower_bound - recall
+
+  @property
+  def proxy_constraints(self):
+    # Use 1 - hinge since we're SUBTRACTING recall in the constraint function,
+    # and we want the proxy constraint function to be convex.
+    true_positives = self._labels * tf.minimum(1.0, self._predictions)
+    true_positive_count = tf.reduce_sum(true_positives)
+    recall = true_positive_count / self._positive_count
+    # Please see the corresponding comment in the constraints property.
+    return self._recall_lower_bound - recall
+```
+
+We'll now create a simple simulated dataset by sampling 1000 random
+10-dimensional feature vectors from a Gaussian, finding their labels using a
+random "ground truth" linear model, and then adding noise by randomly flipping
+200 labels.
+
+```python
+# Create a simulated 10-dimensional training dataset consisting of 1000 labeled
+# examples, of which 800 are labeled correctly and 200 are mislabeled.
+num_examples = 1000
+num_mislabeled_examples = 200
+dimension = 10
+# We will constrain the recall to be at least 90%.
+recall_lower_bound = 0.9
+
+# Create random "ground truth" parameters to a linear model.
+ground_truth_weights = np.random.normal(size=dimension) / math.sqrt(dimension)
+ground_truth_threshold = 0
+
+# Generate a random set of features for each example.
+features = np.random.normal(size=(num_examples, dimension)).astype(
+    np.float32) / math.sqrt(dimension)
+# Compute the labels from these features given the ground truth linear model.
+labels = (np.matmul(features, ground_truth_weights) >
+          ground_truth_threshold).astype(np.float32)
+# Add noise by randomly flipping num_mislabeled_examples labels.
+mislabeled_indices = np.random.choice(
+    num_examples, num_mislabeled_examples, replace=False)
+labels[mislabeled_indices] = 1 - labels[mislabeled_indices]
+```
+
+We're now ready to construct our model, and the corresponding optimization
+problem. We'll use a linear model of the form $$f(x) = w^T x - t$$, where $$w$$
+is the `weights`, and $$t$$ is the `threshold`. The `problem` variable will hold
+an instance of the `ExampleProblem` class we created earlier.
+
+```python
+# Create variables containing the model parameters.
+weights = tf.Variable(tf.zeros(dimension), dtype=tf.float32, name="weights")
+threshold = tf.Variable(0.0, dtype=tf.float32, name="threshold")
+
+# Create the optimization problem.
+constant_labels = tf.constant(labels, dtype=tf.float32)
+constant_features = tf.constant(features, dtype=tf.float32)
+predictions = tf.tensordot(constant_features, weights, axes=(1, 0)) - threshold
+problem = ExampleProblem(
+    labels=constant_labels,
+    predictions=predictions,
+    recall_lower_bound=recall_lower_bound,
+)
+```
+
+We're almost ready to train our model, but first we'll create a couple of
+functions to measure its performance. We're interested in two quantities: the
+average hinge loss (which we seek to minimize), and the recall (which we
+constrain).
+
+```python
+def average_hinge_loss(labels, predictions):
+  num_examples, = np.shape(labels)
+  signed_labels = (labels * 2) - 1
+  total_hinge_loss = np.sum(np.maximum(0.0, 1.0 - signed_labels * predictions))
+  return total_hinge_loss / num_examples
+
+def recall(labels, predictions):
+  positive_count = np.sum(labels)
+  true_positives = labels * (predictions > 0)
+  true_positive_count = np.sum(true_positives)
+  return true_positive_count / positive_count
+```
+
+As was mentioned earlier, external regret optimizers suffice for problems
+without proxy constraints, but swap regret optimizers are recommended for
+problems *with* proxy constraints. Since this problem contains proxy
+constraints, we use the `MultiplicativeSwapRegretOptimizer`.
+
+For this problem, the constraint is fairly easy to satisfy, so we can use the
+same "inner" optimizer (an `AdagradOptimizer` with a learning rate of 1) for
+optimization of both the model parameters (`weights` and `threshold`), and the
+internal parameters associated with the constraints (these are the analogues of
+the Lagrange multipliers used by the `MultiplicativeSwapRegretOptimizer`). For
+more difficult problems, it will often be necessary to use different optimizers,
+with different learning rates (presumably found via a hyperparameter search): to
+accomplish this, pass *both* the `optimizer` and `constraint_optimizer`
+parameters to `MultiplicativeSwapRegretOptimizer`'s constructor.
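+
+A hypothetical two-optimizer configuration of that form (the learning rates
+here are placeholders, not tuned values) might look like:
+
+```python
+optimizer = tfco.MultiplicativeSwapRegretOptimizer(
+    optimizer=tf.train.AdagradOptimizer(learning_rate=1.0),
+    constraint_optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.01))
+```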
+
+Since this is a convex problem (both the objective and proxy constraint
+functions are convex), we can just take the last iterate. Periodic snapshotting,
+and the use of the `find_best_candidate_distribution` or
+`find_best_candidate_index` functions, is generally only necessary for
+non-convex problems (and even then, it isn't *always* necessary).
+
+```python
+with tf.Session() as session:
+  optimizer = tfco.MultiplicativeSwapRegretOptimizer(
+      optimizer=tf.train.AdagradOptimizer(learning_rate=1.0))
+  train_op = optimizer.minimize(problem)
+
+  session.run(tf.global_variables_initializer())
+  for ii in xrange(1000):
+    session.run(train_op)
+
+  trained_weights, trained_threshold = session.run((weights, threshold))
+
+trained_predictions = np.matmul(features, trained_weights) - trained_threshold
+print("Constrained average hinge loss = %f" % average_hinge_loss(
+    labels, trained_predictions))
+print("Constrained recall = %f" % recall(labels, trained_predictions))
+```
+
+Running the above code gives the following output (due to the randomness of the
+dataset, you'll get a different result when you run it):
+
+```none
+Constrained average hinge loss = 0.710019
+Constrained recall = 0.899811
+```
+
+As we hoped, the recall is extremely close to 90%—and, thanks to the use
+of proxy constraints, this is the *true* recall, not a hinge approximation.
+
+For comparison, let's try optimizing the same problem *without* the recall
+constraint:
+
+```python
+with tf.Session() as session:
+  optimizer = tf.train.AdagradOptimizer(learning_rate=1.0)
+  # For optimizing the unconstrained problem, we just minimize the "objective"
+  # portion of the minimization problem.
+  train_op = optimizer.minimize(problem.objective)
+
+  session.run(tf.global_variables_initializer())
+  for ii in xrange(1000):
+    session.run(train_op)
+
+  trained_weights, trained_threshold = session.run((weights, threshold))
+
+trained_predictions = np.matmul(features, trained_weights) - trained_threshold
+print("Unconstrained average hinge loss = %f" % average_hinge_loss(
+    labels, trained_predictions))
+print("Unconstrained recall = %f" % recall(labels, trained_predictions))
+```
+
+This code gives the following output (again, you'll get a different answer,
+since the dataset is random):
+
+```none
+Unconstrained average hinge loss = 0.627271
+Unconstrained recall = 0.793951
+```
+
+Because there is no constraint, the unconstrained problem does a better job of
+minimizing the average hinge loss, but naturally doesn't approach 90% recall.
diff --git a/tensorflow/contrib/constrained_optimization/__init__.py b/tensorflow/contrib/constrained_optimization/__init__.py
new file mode 100644
index 00000000000..1e49ba9f179
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/__init__.py
@@ -0,0 +1,41 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A library for performing constrained optimization in TensorFlow."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import
+from tensorflow.contrib.constrained_optimization.python.candidates import *
+from tensorflow.contrib.constrained_optimization.python.constrained_minimization_problem import *
+from tensorflow.contrib.constrained_optimization.python.constrained_optimizer import *
+from tensorflow.contrib.constrained_optimization.python.external_regret_optimizer import *
+from tensorflow.contrib.constrained_optimization.python.swap_regret_optimizer import *
+# pylint: enable=wildcard-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    "AdditiveExternalRegretOptimizer",
+    "AdditiveSwapRegretOptimizer",
+    "ConstrainedMinimizationProblem",
+    "ConstrainedOptimizer",
+    "find_best_candidate_distribution",
+    "find_best_candidate_index",
+    "MultiplicativeSwapRegretOptimizer",
+]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/constrained_optimization/python/candidates.py b/tensorflow/contrib/constrained_optimization/python/candidates.py
new file mode 100644
index 00000000000..ac86a6741be
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/candidates.py
@@ -0,0 +1,319 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Code for optimizing over a set of candidate solutions.
+
+The functions in this file deal with the constrained problem:
+
+> minimize f(w)
+> s.t. g_i(w) <= 0 for all i in {0,1,...,m-1}
+
+Here, f(w) is the "objective function", and g_i(w) is the ith (of m) "constraint
+function". Given the values of the objective and constraint functions for a set
+of n "candidate solutions" {w_0,w_1,...,w_{n-1}} (for a total of n objective
+function values, and n*m constraint function values), the
+`find_best_candidate_distribution` function finds the best DISTRIBUTION over
+these candidates, while `find_best_candidate_index` heuristically finds the
+single best candidate.
+
+Both of these functions have dependencies on `scipy`, so if you want to call
+them, then you must make sure that `scipy` is available. The imports are
+performed inside the functions themselves, so if they're not actually called,
+then `scipy` is not needed.
+
+For more specifics, please refer to:
+
+> Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+> Constrained Optimization".
+> [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+The `find_best_candidate_distribution` function implements the approach
+described in Lemma 3, while `find_best_candidate_index` implements the heuristic
+used for hyperparameter search in the experiments of Section 5.2.
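+
+As a rough usage sketch (with hypothetical inputs, where `objective_vector`
+has shape (n,) and `constraints_matrix` has shape (m,n)):
+
+```python
+import numpy as np
+
+objective_vector = np.array([1.0, 2.0, 3.0])
+constraints_matrix = np.array([[0.1, -0.2, -0.3],
+                               [-0.1, 0.2, -0.3]])
+
+# A distribution (of shape (n,)) over the three candidates, with at most
+# m+1 = 3 of its entries nonzero.
+distribution = find_best_candidate_distribution(objective_vector,
+                                                constraints_matrix)
+# The index of the single "best" candidate, chosen heuristically.
+index = find_best_candidate_index(objective_vector, constraints_matrix)
+```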
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+
+def _find_best_candidate_distribution_helper(objective_vector,
+                                             constraints_matrix,
+                                             maximum_violation=0.0):
+  """Finds a distribution minimizing an objective subject to constraints.
+
+  This function deals with the constrained problem:
+
+  > minimize f(w)
+  > s.t. g_i(w) <= 0 for all i in {0,1,...,m-1}
+
+  Here, f(w) is the "objective function", and g_i(w) is the ith (of m)
+  "constraint function". Given a set of n "candidate solutions"
+  {w_0,w_1,...,w_{n-1}}, this function finds a distribution over these n
+  candidates that, in expectation, minimizes the objective while violating
+  the constraints by no more than `maximum_violation`. If no such distribution
+  exists, it returns an error (using Go-style error reporting).
+
+  The `objective_vector` parameter should be a numpy array with shape (n,), for
+  which objective_vector[i] = f(w_i). Likewise, `constraints_matrix` should be a
+  numpy array with shape (m,n), for which constraints_matrix[i,j] = g_i(w_j).
+
+  This function will return a distribution for which at most m+1 probabilities,
+  and often fewer, are nonzero.
+
+  Args:
+    objective_vector: numpy array of shape (n,), where n is the number of
+      "candidate solutions". Contains the objective function values.
+    constraints_matrix: numpy array of shape (m,n), where m is the number of
+      constraints and n is the number of "candidate solutions". Contains the
+      constraint violation magnitudes.
+    maximum_violation: nonnegative float, the maximum amount by which any
+      constraint may be violated, in expectation.
+
+  Returns:
+    A pair (`result`, `message`), exactly one of which is None. If `message` is
+      None, then the `result` contains the optimal distribution as a numpy array
+      of shape (n,). If `result` is None, then `message` contains an error
+      message.
+
+  Raises:
+    ValueError: If `objective_vector` and `constraints_matrix` have inconsistent
+      shapes, or if `maximum_violation` is negative.
+    ImportError: If we're unable to import `scipy.optimize`.
+  """
+  if maximum_violation < 0.0:
+    raise ValueError("maximum_violation must be nonnegative")
+
+  mm, nn = np.shape(constraints_matrix)
+  if (nn,) != np.shape(objective_vector):
+    raise ValueError(
+        "objective_vector must have shape (n,), and constraints_matrix (m, n),"
+        " where n is the number of candidates, and m is the number of "
+        "constraints")
+
+  # We import scipy inline, instead of at the top of the file, so that a scipy
+  # dependency is only introduced if either find_best_candidate_distribution()
+  # or find_best_candidate_index() are actually called.
+  import scipy.optimize  # pylint: disable=g-import-not-at-top
+
+  # Feasibility (within maximum_violation) constraints.
+  a_ub = constraints_matrix
+  b_ub = np.full((mm, 1), maximum_violation)
+  # Sum-to-one constraint.
+  a_eq = np.ones((1, nn))
+  b_eq = np.ones((1, 1))
+  # Nonnegativity constraints.
+  bounds = (0, None)
+
+  result = scipy.optimize.linprog(
+      objective_vector,
+      A_ub=a_ub,
+      b_ub=b_ub,
+      A_eq=a_eq,
+      b_eq=b_eq,
+      bounds=bounds)
+  # Go-style error reporting. We don't raise on error, since
+  # find_best_candidate_distribution() needs to handle the failure case, and we
+  # shouldn't use exceptions as flow-control.
+  if not result.success:
+    return (None, result.message)
+  else:
+    return (result.x, None)
+
+
+def find_best_candidate_distribution(objective_vector,
+                                     constraints_matrix,
+                                     epsilon=0.0):
+  """Finds a distribution minimizing an objective subject to constraints.
+
+  This function deals with the constrained problem:
+
+  > minimize f(w)
+  > s.t. g_i(w) <= 0 for all i in {0,1,...,m-1}
+
+  Here, f(w) is the "objective function", and g_i(w) is the ith (of m)
+  "constraint function". Given a set of n "candidate solutions"
+  {w_0,w_1,...,w_{n-1}}, this function finds a distribution over these n
+  candidates that, in expectation, minimizes the objective while violating
+  the constraints by the smallest possible amount (with the amount being found
+  via bisection search).
+
+  The `objective_vector` parameter should be a numpy array with shape (n,), for
+  which objective_vector[i] = f(w_i). Likewise, `constraints_matrix` should be a
+  numpy array with shape (m,n), for which constraints_matrix[i,j] = g_i(w_j).
+
+  This function will return a distribution for which at most m+1 probabilities,
+  and often fewer, are nonzero.
+
+  For more specifics, please refer to:
+
+  > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+  > Constrained Optimization".
+  > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+  This function implements the approach described in Lemma 3.
+
+  Args:
+    objective_vector: numpy array of shape (n,), where n is the number of
+      "candidate solutions". Contains the objective function values.
+    constraints_matrix: numpy array of shape (m,n), where m is the number of
+      constraints and n is the number of "candidate solutions". Contains the
+      constraint violation magnitudes.
+    epsilon: nonnegative float, the threshold at which to terminate the
+      bisection search for the minimal expected constraint violation
+      magnitude.
+
+  Returns:
+    The optimal distribution, as a numpy array of shape (n,).
+
+  Raises:
+    ValueError: If `objective_vector` and `constraints_matrix` have inconsistent
+      shapes, or if `epsilon` is negative.
+    ImportError: If we're unable to import `scipy.optimize`.
+  """
+  if epsilon < 0.0:
+    raise ValueError("epsilon must be nonnegative")
+
+  # If there is a feasible solution (i.e. with maximum_violation=0), then that's
+  # what we'll return.
+  pp, _ = _find_best_candidate_distribution_helper(objective_vector,
+                                                   constraints_matrix)
+  if pp is not None:
+    return pp
+
+  # The bound is the minimum over all candidates, of the maximum per-candidate
+  # constraint violation.
+  lower = 0.0
+  upper = np.min(np.amax(constraints_matrix, axis=0))
+  best_pp, _ = _find_best_candidate_distribution_helper(
+      objective_vector, constraints_matrix, maximum_violation=upper)
+  assert best_pp is not None
+
+  # Throughout this loop, a maximum_violation of "lower" is not achievable,
+  # but a maximum_violation of "upper" is achievable.
+  while True:
+    middle = 0.5 * (lower + upper)
+    if (middle - lower <= epsilon) or (upper - middle <= epsilon):
+      break
+    else:
+      pp, _ = _find_best_candidate_distribution_helper(
+          objective_vector, constraints_matrix, maximum_violation=middle)
+      if pp is None:
+        lower = middle
+      else:
+        best_pp = pp
+        upper = middle
+
+  return best_pp
+
+
+def find_best_candidate_index(objective_vector,
+                              constraints_matrix,
+                              rank_objectives=False):
+  """Heuristically finds the best candidate solution to a constrained problem.
+
+  This function deals with the constrained problem:
+
+  > minimize f(w)
+  > s.t. g_i(w) <= 0 for all i in {0,1,...,m-1}
+
+  Here, f(w) is the "objective function", and g_i(w) is the ith (of m)
+  "constraint function". Given a set of n "candidate solutions"
+  {w_0,w_1,...,w_{n-1}}, this function finds the "best" solution according
+  to the following heuristic:
+
+    1. Across all models, the ith constraint violations (i.e. max{0, g_i(w_j)})
+       are ranked, as are the objectives (if rank_objectives=True).
+    2. Each model is then associated with its MAXIMUM rank across all m
+       (and the objective, if rank_objectives=True).
+    3. The model with the minimal maximum rank is then identified. Ties are
+       broken using the objective function value.
+    4. The index of this "best" model is returned.
+
+  The `objective_vector` parameter should be a numpy array with shape (n,), for
+  which objective_vector[i] = f(w_i). Likewise, `constraints_matrix` should be a
+  numpy array with shape (m,n), for which constraints_matrix[i,j] = g_i(w_j).
+
+  For more specifics, please refer to:
+
+  > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+  > Constrained Optimization".
+  > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+  This function implements the heuristic used for hyperparameter search in the
+  experiments of Section 5.2.
+
+  Args:
+    objective_vector: numpy array of shape (n,), where n is the number of
+      "candidate solutions". Contains the objective function values.
+    constraints_matrix: numpy array of shape (m,n), where m is the number of
+      constraints and n is the number of "candidate solutions". Contains the
+      constraint violation magnitudes.
+    rank_objectives: bool, whether the objective function values should be
+      included in the initial ranking step. If True, both the objective and
+      constraints will be ranked. If False, only the constraints will be ranked.
+      In either case, the objective function values will be used for
+      tiebreaking.
+
+  Returns:
+    The index (in {0,1,...,n-1}) of the "best" model according to the above
+      heuristic.
+
+  Raises:
+    ValueError: If `objective_vector` and `constraints_matrix` have inconsistent
+      shapes.
+    ImportError: If we're unable to import `scipy.stats`.
+  """
+  mm, nn = np.shape(constraints_matrix)
+  if (nn,) != np.shape(objective_vector):
+    raise ValueError(
+        "objective_vector must have shape (n,), and constraints_matrix (m, n),"
+        " where n is the number of candidates, and m is the number of "
+        "constraints")
+
+  # We import scipy inline, instead of at the top of the file, so that a scipy
+  # dependency is only introduced if either find_best_candidate_distribution()
+  # or find_best_candidate_index() are actually called.
+  import scipy.stats  # pylint: disable=g-import-not-at-top
+
+  if rank_objectives:
+    maximum_ranks = scipy.stats.rankdata(objective_vector, method="min")
+  else:
+    maximum_ranks = np.zeros(nn, dtype=np.int64)
+  for ii in xrange(mm):
+    # Take the maximum of the constraint functions with zero, since we want to
+    # rank the magnitude of constraint *violations*. If the constraint is
+    # satisfied, then we don't care how much it's satisfied by (as a result,
+    # we expect all models satisfying a constraint to be tied at rank 1).
+    ranks = scipy.stats.rankdata(
+        np.maximum(0.0, constraints_matrix[ii, :]), method="min")
+    maximum_ranks = np.maximum(maximum_ranks, ranks)
+
+  best_index = None
+  best_rank = float("Inf")
+  best_objective = float("Inf")
+  for ii in xrange(nn):
+    if maximum_ranks[ii] < best_rank:
+      best_index = ii
+      best_rank = maximum_ranks[ii]
+      best_objective = objective_vector[ii]
+    elif (maximum_ranks[ii] == best_rank) and (objective_vector[ii] <=
+                                               best_objective):
+      best_index = ii
+      best_objective = objective_vector[ii]
+
+  return best_index
diff --git a/tensorflow/contrib/constrained_optimization/python/candidates_test.py b/tensorflow/contrib/constrained_optimization/python/candidates_test.py
new file mode 100644
index 00000000000..a4c49d48bc5
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/candidates_test.py
@@ -0,0 +1,95 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for constrained_optimization.python.candidates."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.constrained_optimization.python import candidates
+from tensorflow.python.platform import test
+
+
+class CandidatesTest(test.TestCase):
+
+  def test_inconsistent_shapes_for_best_distribution(self):
+    """An error is raised when parameters have inconsistent shapes."""
+    objective_vector = np.array([1, 2, 3])
+    constraints_matrix = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
+    with self.assertRaises(ValueError):
+      _ = candidates.find_best_candidate_distribution(objective_vector,
+                                                      constraints_matrix)
+
+  def test_inconsistent_shapes_for_best_index(self):
+    """An error is raised when parameters have inconsistent shapes."""
+    objective_vector = np.array([1, 2, 3])
+    constraints_matrix = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
+    with self.assertRaises(ValueError):
+      _ = candidates.find_best_candidate_index(objective_vector,
+                                               constraints_matrix)
+
+  def test_best_distribution(self):
+    """Distribution should match known solution."""
+    objective_vector = np.array(
+        [0.03053309, -0.06667082, 0.88355145, 0.46529806])
+    constraints_matrix = np.array(
+        [[-0.60164551, 0.36676229, 0.7856454, -0.8441711],
+         [0.00371592, -0.16392108, -0.59778071, -0.56908492]])
+    distribution = candidates.find_best_candidate_distribution(
+        objective_vector, constraints_matrix)
+    # Verify that the solution is a probability distribution.
+    self.assertTrue(np.all(distribution >= 0))
+    self.assertAlmostEqual(np.sum(distribution), 1.0)
+    # Verify that the solution satisfies the constraints.
+    maximum_constraint_violation = np.amax(
+        np.dot(constraints_matrix, distribution))
+    self.assertLessEqual(maximum_constraint_violation, 0)
+    # Verify that the solution matches that which we expect.
+    expected_distribution = np.array([0.37872711, 0.62127289, 0, 0])
+    self.assertAllClose(expected_distribution, distribution, rtol=0, atol=1e-6)
+
+  def test_best_index_rank_objectives_true(self):
+    """Index should match known solution."""
+    # Objective ranks = [2, 1, 4, 3].
+    objective_vector = np.array(
+        [0.03053309, -0.06667082, 0.88355145, 0.46529806])
+    # Constraint ranks = [[1, 3, 4, 1], [4, 1, 1, 1]].
+    constraints_matrix = np.array(
+        [[-0.60164551, 0.36676229, 0.7856454, -0.8441711],
+         [0.00371592, -0.16392108, -0.59778071, -0.56908492]])
+    # Maximum ranks = [4, 3, 4, 3].
+    index = candidates.find_best_candidate_index(
+        objective_vector, constraints_matrix, rank_objectives=True)
+    self.assertEqual(1, index)
+
+  def test_best_index_rank_objectives_false(self):
+    """Index should match known solution."""
+    # Objective ranks = [2, 1, 4, 3].
+    objective_vector = np.array(
+        [0.03053309, -0.06667082, 0.88355145, 0.46529806])
+    # Constraint ranks = [[1, 3, 4, 1], [4, 1, 1, 1]].
+    constraints_matrix = np.array(
+        [[-0.60164551, 0.36676229, 0.7856454, -0.8441711],
+         [0.00371592, -0.16392108, -0.59778071, -0.56908492]])
+    # Maximum ranks = [4, 3, 4, 1].
+    index = candidates.find_best_candidate_index(
+        objective_vector, constraints_matrix, rank_objectives=False)
+    self.assertEqual(3, index)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py b/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py
new file mode 100644
index 00000000000..70813fb2179
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py
@@ -0,0 +1,123 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Defines abstract class for `ConstrainedMinimizationProblem`s.
+
+A ConstrainedMinimizationProblem consists of an objective function to minimize,
+and a set of constraint functions that are constrained to be nonpositive.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import six
+
+
+@six.add_metaclass(abc.ABCMeta)
+class ConstrainedMinimizationProblem(object):
+  """Abstract class representing a `ConstrainedMinimizationProblem`.
+
+  A ConstrainedMinimizationProblem consists of an objective function to
+  minimize, and a set of constraint functions that are constrained to be
+  nonpositive.
+
+  In addition to the constraint functions, there may (optionally) be proxy
+  constraint functions: a ConstrainedOptimizer will attempt to penalize these
+  proxy constraint functions so as to satisfy the (non-proxy) constraints. Proxy
+  constraints could be used if the constraint functions are difficult or
+  impossible to optimize (e.g. if they're piecewise constant), in which case the
+  proxy constraints should be some approximation of the original constraints
+  that is well-enough behaved to permit successful optimization.
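+
+  As a minimal sketch of a concrete subclass (hypothetical names; `labels`
+  and `predictions` are tensors assumed to be defined elsewhere, with labels
+  in {-1, +1}, and the 10% error-rate target is arbitrary):
+
+  ```python
+  class ErrorRateProblem(ConstrainedMinimizationProblem):
+
+    @property
+    def objective(self):
+      # Average hinge loss (differentiable).
+      return tf.reduce_mean(tf.nn.relu(1.0 - labels * predictions))
+
+    @property
+    def constraints(self):
+      # True constraint: error rate <= 10%. This is piecewise constant, so
+      # it is only EVALUATED during optimization.
+      error_rate = tf.reduce_mean(tf.to_float(labels * predictions <= 0))
+      return error_rate - 0.1
+
+    @property
+    def proxy_constraints(self):
+      # Differentiable hinge upper bound on the error-rate constraint; this
+      # is what is actually DIFFERENTIATED.
+      return tf.reduce_mean(tf.nn.relu(1.0 - labels * predictions)) - 0.1
+  ```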
+  """
+
+  @abc.abstractproperty
+  def objective(self):
+    """Returns the objective function.
+
+    Returns:
+      A 0d tensor that should be minimized.
+    """
+    pass
+
+  @property
+  def num_constraints(self):
+    """Returns the number of constraints.
+
+    Returns:
+      An int containing the number of constraints.
+
+    Raises:
+      ValueError: If the constraints (or proxy_constraints, if present) do not
+        have fully-known shapes, OR if proxy_constraints are present, and the
+        shapes of constraints and proxy_constraints are fully-known, but they're
+        different.
+    """
+    constraints_shape = self.constraints.get_shape()
+    if self.proxy_constraints is None:
+      proxy_constraints_shape = constraints_shape
+    else:
+      proxy_constraints_shape = self.proxy_constraints.get_shape()
+
+    if (constraints_shape is None or proxy_constraints_shape is None or
+        any([ii is None for ii in constraints_shape.as_list()]) or
+        any([ii is None for ii in proxy_constraints_shape.as_list()])):
+      raise ValueError(
+          "constraints and proxy_constraints must have fully-known shapes")
+    if constraints_shape != proxy_constraints_shape:
+      raise ValueError(
+          "constraints and proxy_constraints must have the same shape")
+
+    size = 1
+    for ii in constraints_shape.as_list():
+      size *= ii
+    return int(size)
+
+  @abc.abstractproperty
+  def constraints(self):
+    """Returns the vector of constraint functions.
+
+    Letting g_i be the ith element of the constraints vector, the ith constraint
+    will be g_i <= 0.
+
+    Returns:
+      A tensor of constraint functions.
+    """
+    pass
+
+  # This is a property, instead of an abstract property, since it doesn't need
+  # to be overridden: if proxy_constraints returns None, then there are no
+  # proxy constraints.
+  @property
+  def proxy_constraints(self):
+    """Returns the optional vector of proxy constraint functions.
+
+    The difference between `constraints` and `proxy_constraints` is that, when
+    proxy constraints are present, the `constraints` are merely EVALUATED during
+    optimization, whereas the `proxy_constraints` are DIFFERENTIATED. If there
+    are no proxy constraints, then the `constraints` are both evaluated and
+    differentiated.
+
+    For example, if we want to impose constraints on step functions, then we
+    could use these functions for `constraints`. However, because a step
+    function has zero gradient almost everywhere, we can't differentiate these
+    functions, so we would take `proxy_constraints` to be some differentiable
+    approximation of `constraints`.
+
+    Returns:
+      A tensor of proxy constraint functions.
+    """
+    return None
diff --git a/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py b/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py
new file mode 100644
index 00000000000..80555453661
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py
@@ -0,0 +1,208 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Defines base class for `ConstrainedOptimizer`s."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import six
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import standard_ops
+from tensorflow.python.training import optimizer as train_optimizer
+
+
+@six.add_metaclass(abc.ABCMeta)
+class ConstrainedOptimizer(object):
+  """Base class representing a constrained optimizer.
+
+  A ConstrainedOptimizer wraps a tf.train.Optimizer (or more than one), and
+  applies it to a ConstrainedMinimizationProblem. Unlike a tf.train.Optimizer,
+  which takes a tensor to minimize as a parameter to its minimize() method, a
+  constrained optimizer instead takes a ConstrainedMinimizationProblem.
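+
+  A rough usage sketch (assuming `problem` is a ConstrainedMinimizationProblem
+  and `optimizer` is an instance of a concrete subclass, such as an
+  AdditiveExternalRegretOptimizer):
+
+  ```python
+  train_op = optimizer.minimize(problem)
+  with tf.Session() as session:
+    session.run(tf.global_variables_initializer())
+    for _ in xrange(1000):
+      session.run(train_op)
+  ```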
+  """
+
+  def __init__(self, optimizer):
+    """Constructs a new `ConstrainedOptimizer`.
+
+    Args:
+      optimizer: tf.train.Optimizer, used to optimize the
+        ConstrainedMinimizationProblem.
+
+    Returns:
+      A new `ConstrainedOptimizer`.
+    """
+    self._optimizer = optimizer
+
+  @property
+  def optimizer(self):
+    """Returns the `tf.train.Optimizer` used for optimization."""
+    return self._optimizer
+
+  def minimize_unconstrained(self,
+                             minimization_problem,
+                             global_step=None,
+                             var_list=None,
+                             gate_gradients=train_optimizer.Optimizer.GATE_OP,
+                             aggregation_method=None,
+                             colocate_gradients_with_ops=False,
+                             name=None,
+                             grad_loss=None):
+    """Returns an `Op` for minimizing the unconstrained problem.
+
+    Unlike `minimize_constrained`, this function ignores the `constraints` (and
+    `proxy_constraints`) portion of the minimization problem entirely, and only
+    minimizes `objective`.
+
+    Args:
+      minimization_problem: ConstrainedMinimizationProblem, the problem to
+        optimize.
+      global_step: as in `tf.train.Optimizer`'s `minimize` method.
+      var_list: as in `tf.train.Optimizer`'s `minimize` method.
+      gate_gradients: as in `tf.train.Optimizer`'s `minimize` method.
+      aggregation_method: as in `tf.train.Optimizer`'s `minimize` method.
+      colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize`
+        method.
+      name: as in `tf.train.Optimizer`'s `minimize` method.
+      grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
+
+    Returns:
+      TensorFlow Op.
+    """
+    return self.optimizer.minimize(
+        minimization_problem.objective,
+        global_step=global_step,
+        var_list=var_list,
+        gate_gradients=gate_gradients,
+        aggregation_method=aggregation_method,
+        colocate_gradients_with_ops=colocate_gradients_with_ops,
+        name=name,
+        grad_loss=grad_loss)
+
+  @abc.abstractmethod
+  def minimize_constrained(self,
+                           minimization_problem,
+                           global_step=None,
+                           var_list=None,
+                           gate_gradients=train_optimizer.Optimizer.GATE_OP,
+                           aggregation_method=None,
+                           colocate_gradients_with_ops=False,
+                           name=None,
+                           grad_loss=None):
+    """Returns an `Op` for minimizing the constrained problem.
+
+    Unlike `minimize_unconstrained`, this function attempts to find a solution
+    that minimizes the `objective` portion of the minimization problem while
+    satisfying the `constraints` portion.
+
+    Args:
+      minimization_problem: ConstrainedMinimizationProblem, the problem to
+        optimize.
+      global_step: as in `tf.train.Optimizer`'s `minimize` method.
+      var_list: as in `tf.train.Optimizer`'s `minimize` method.
+      gate_gradients: as in `tf.train.Optimizer`'s `minimize` method.
+      aggregation_method: as in `tf.train.Optimizer`'s `minimize` method.
+      colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize`
+        method.
+      name: as in `tf.train.Optimizer`'s `minimize` method.
+      grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
+
+    Returns:
+      TensorFlow Op.
+    """
+    pass
+
+  def minimize(self,
+               minimization_problem,
+               unconstrained_steps=None,
+               global_step=None,
+               var_list=None,
+               gate_gradients=train_optimizer.Optimizer.GATE_OP,
+               aggregation_method=None,
+               colocate_gradients_with_ops=False,
+               name=None,
+               grad_loss=None):
+    """Returns an `Op` for minimizing the constrained problem.
+
+    This method combines the functionality of `minimize_unconstrained` and
+    `minimize_constrained`. If global_step < unconstrained_steps, it will
+    perform an unconstrained update, and if global_step >= unconstrained_steps,
+    it will perform a constrained update.
+
+    The reason for this functionality is that it may be best to initialize the
+    constrained optimizer with an approximate optimum of the unconstrained
+    problem.
+
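+    For example, the following sketch (with a hypothetical `problem`) would
+    take 100 unconstrained steps before switching to constrained updates:
+
+    ```python
+    step = tf.train.create_global_step()
+    train_op = optimizer.minimize(
+        problem, unconstrained_steps=100, global_step=step)
+    ```
+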
+    Args:
+      minimization_problem: ConstrainedMinimizationProblem, the problem to
+        optimize.
+      unconstrained_steps: int, number of steps for which we should perform
+        unconstrained updates, before transitioning to constrained updates.
+      global_step: as in `tf.train.Optimizer`'s `minimize` method.
+      var_list: as in `tf.train.Optimizer`'s `minimize` method.
+      gate_gradients: as in `tf.train.Optimizer`'s `minimize` method.
+      aggregation_method: as in `tf.train.Optimizer`'s `minimize` method.
+      colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize`
+        method.
+      name: as in `tf.train.Optimizer`'s `minimize` method.
+      grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
+
+    Returns:
+      TensorFlow Op.
+
+    Raises:
+      ValueError: If unconstrained_steps is provided, but global_step is not.
+    """
+
+    def unconstrained_fn():
+      """Returns an `Op` for minimizing the unconstrained problem."""
+      return self.minimize_unconstrained(
+          minimization_problem=minimization_problem,
+          global_step=global_step,
+          var_list=var_list,
+          gate_gradients=gate_gradients,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          name=name,
+          grad_loss=grad_loss)
+
+    def constrained_fn():
+      """Returns an `Op` for minimizing the constrained problem."""
+      return self.minimize_constrained(
+          minimization_problem=minimization_problem,
+          global_step=global_step,
+          var_list=var_list,
+          gate_gradients=gate_gradients,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          name=name,
+          grad_loss=grad_loss)
+
+    if unconstrained_steps is not None:
+      if global_step is None:
+        raise ValueError(
+            "global_step cannot be None if unconstrained_steps is provided")
+      unconstrained_steps_tensor = ops.convert_to_tensor(unconstrained_steps)
+      dtype = unconstrained_steps_tensor.dtype
+      return control_flow_ops.cond(
+          standard_ops.cast(global_step, dtype) < unconstrained_steps_tensor,
+          true_fn=unconstrained_fn,
+          false_fn=constrained_fn)
+    else:
+      return constrained_fn()
diff --git a/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py
new file mode 100644
index 00000000000..01c6e4f08af
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py
@@ -0,0 +1,375 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Defines `AdditiveExternalRegretOptimizer`.
+
+This optimizer minimizes a `ConstrainedMinimizationProblem` by introducing
+Lagrange multipliers, and using `tf.train.Optimizer`s to jointly optimize over
+the model parameters and Lagrange multipliers.
+
+For the purposes of constrained optimization, at least in theory,
+external-regret minimization suffices if the `ConstrainedMinimizationProblem`
+we're optimizing doesn't have any `proxy_constraints`, while swap-regret
+minimization should be used if `proxy_constraints` are present.
+
+For more specifics, please refer to:
+
+> Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+> Constrained Optimization".
+> [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+The formulation used by the AdditiveExternalRegretOptimizer--which is simply the
+usual Lagrangian formulation--can be found in Definition 1, and is discussed in
+Section 3. This optimizer is most similar to Algorithm 3 in Appendix C.3, with
+the two differences being that it uses proxy constraints (if they're provided)
+in the update of the model parameters, and uses `tf.train.Optimizer`s, instead
+of SGD, for the "inner" updates.
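+
+A rough usage sketch (`problem` is a hypothetical
+ConstrainedMinimizationProblem):
+
+```python
+optimizer = AdditiveExternalRegretOptimizer(
+    optimizer=tf.train.AdagradOptimizer(learning_rate=1.0),
+    maximum_multiplier_radius=1.0)
+train_op = optimizer.minimize_constrained(problem)
+```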
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import six
+
+from tensorflow.contrib.constrained_optimization.python import constrained_optimizer
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import standard_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import optimizer as train_optimizer
+
+
+def _project_multipliers_wrt_euclidean_norm(multipliers, radius):
+  """Projects its argument onto the feasible region.
+
+  The feasible region is the set of all vectors with nonnegative elements that
+  sum to at most `radius`.
+
+  Args:
+    multipliers: 1d tensor, the Lagrange multipliers to project.
+    radius: float, the radius of the feasible region.
+
+  Returns:
+    The 1d tensor that results from projecting `multipliers` onto the feasible
+      region w.r.t. the Euclidean norm.
+
+  Raises:
+    ValueError: if the `multipliers` tensor does not have a fully-known shape,
+      or is not one-dimensional.
+  """
+  multipliers_shape = multipliers.get_shape()
+  if multipliers_shape is None:
+    raise ValueError("multipliers must have known shape")
+  if multipliers_shape.ndims != 1:
+    raise ValueError(
+        "multipliers must be one dimensional (instead is %d-dimensional)" %
+        multipliers_shape.ndims)
+  dimension = multipliers_shape[0].value
+  if dimension is None:
+    raise ValueError("multipliers must have fully-known shape")
+
+  def while_loop_condition(iteration, multipliers, inactive, old_inactive):
+    """Returns false if the while loop should terminate."""
+    del multipliers  # Needed by the body, but not the condition.
+    not_done = (iteration < dimension)
+    not_converged = standard_ops.reduce_any(
+        standard_ops.not_equal(inactive, old_inactive))
+    return standard_ops.logical_and(not_done, not_converged)
+
+  def while_loop_body(iteration, multipliers, inactive, old_inactive):
+    """Performs one iteration of the projection."""
+    del old_inactive  # Needed by the condition, but not the body.
+    iteration += 1
+    scale = standard_ops.minimum(
+        0.0,
+        (radius - standard_ops.reduce_sum(multipliers)) / standard_ops.maximum(
+            1.0, standard_ops.reduce_sum(inactive)))
+    multipliers += scale * inactive
+    new_inactive = standard_ops.to_float(multipliers > 0)
+    multipliers *= new_inactive
+    return (iteration, multipliers, new_inactive, inactive)
+
+  iteration = standard_ops.constant(0)
+  inactive = standard_ops.ones_like(multipliers)
+
+  # We actually want a do-while loop, so we explicitly call while_loop_body()
+  # once before tf.while_loop().
+  iteration, multipliers, inactive, old_inactive = while_loop_body(
+      iteration, multipliers, inactive, inactive)
+  iteration, multipliers, inactive, old_inactive = control_flow_ops.while_loop(
+      while_loop_condition,
+      while_loop_body,
+      loop_vars=(iteration, multipliers, inactive, old_inactive),
+      name="euclidean_projection")
+
+  return multipliers
+
+
+@six.add_metaclass(abc.ABCMeta)
+class _ExternalRegretOptimizer(constrained_optimizer.ConstrainedOptimizer):
+  """Base class representing an `_ExternalRegretOptimizer`.
+
+  This class contains most of the logic for performing constrained
+  optimization, minimizing external regret for the constraints player. What it
+  *doesn't* do is keep track of the internal state (the Lagrange multipliers).
+  Instead, the state is accessed via the _initial_state(),
+  _lagrange_multipliers(), _constraint_grad_and_var() and _projection_op()
+  methods.
+
+  The reason for this is that we want to make it easy to implement different
+  representations of the internal state.
+
+  For more specifics, please refer to:
+
+  > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+  > Constrained Optimization".
+  > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+  The formulation used by `_ExternalRegretOptimizer`s--which is simply the usual
+  Lagrangian formulation--can be found in Definition 1, and is discussed in
+  Section 3. Such optimizers are most similar to Algorithm 3 in Appendix C.3.
+  """
+
+  def __init__(self, optimizer, constraint_optimizer=None):
+    """Constructs a new `_ExternalRegretOptimizer`.
+
+    The difference between `optimizer` and `constraint_optimizer` (if the latter
+    is provided) is that the former is used for learning the model parameters,
+    while the latter is used for the Lagrange multipliers. If no
+    `constraint_optimizer` is provided, then `optimizer` is used for both.
+
+    Args:
+      optimizer: tf.train.Optimizer, used to optimize the objective and
+        proxy_constraints portion of the ConstrainedMinimizationProblem. If
+        constraint_optimizer is not provided, this will also be used to optimize
+        the Lagrange multipliers.
+      constraint_optimizer: optional tf.train.Optimizer, used to optimize the
+        Lagrange multipliers.
+
+    Returns:
+      A new `_ExternalRegretOptimizer`.
+    """
+    super(_ExternalRegretOptimizer, self).__init__(optimizer=optimizer)
+    self._constraint_optimizer = constraint_optimizer
+
+  @property
+  def constraint_optimizer(self):
+    """Returns the `tf.train.Optimizer` used for the Lagrange multipliers."""
+    return self._constraint_optimizer
+
+  @abc.abstractmethod
+  def _initial_state(self, num_constraints):
+    pass
+
+  @abc.abstractmethod
+  def _lagrange_multipliers(self, state):
+    pass
+
+  @abc.abstractmethod
+  def _constraint_grad_and_var(self, state, gradient):
+    pass
+
+  @abc.abstractmethod
+  def _projection_op(self, state, name=None):
+    pass
+
+  def minimize_constrained(self,
+                           minimization_problem,
+                           global_step=None,
+                           var_list=None,
+                           gate_gradients=train_optimizer.Optimizer.GATE_OP,
+                           aggregation_method=None,
+                           colocate_gradients_with_ops=False,
+                           name=None,
+                           grad_loss=None):
+    """Returns an `Op` for minimizing the constrained problem.
+
+    The `optimizer` constructor parameter will be used to update the model
+    parameters, while the Lagrange multipliers will be updated using
+    `constraint_optimizer` (if provided) or `optimizer` (if not).
+
+    Args:
+      minimization_problem: ConstrainedMinimizationProblem, the problem to
+        optimize.
+      global_step: as in `tf.train.Optimizer`'s `minimize` method.
+      var_list: as in `tf.train.Optimizer`'s `minimize` method.
+      gate_gradients: as in `tf.train.Optimizer`'s `minimize` method.
+      aggregation_method: as in `tf.train.Optimizer`'s `minimize` method.
+      colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize`
+        method.
+      name: as in `tf.train.Optimizer`'s `minimize` method.
+      grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
+
+    Returns:
+      TensorFlow Op.
+    """
+    objective = minimization_problem.objective
+
+    constraints = minimization_problem.constraints
+    proxy_constraints = minimization_problem.proxy_constraints
+    if proxy_constraints is None:
+      proxy_constraints = constraints
+    # Flatten both constraints tensors to 1d.
+    num_constraints = minimization_problem.num_constraints
+    constraints = standard_ops.reshape(constraints, shape=(num_constraints,))
+    proxy_constraints = standard_ops.reshape(
+        proxy_constraints, shape=(num_constraints,))
+
+    # We use a lambda to initialize the state so that, if this function call is
+    # inside the scope of a tf.control_dependencies() block, the dependencies
+    # will not be applied to the initializer.
+    state = standard_ops.Variable(
+        lambda: self._initial_state(num_constraints),
+        trainable=False,
+        name="external_regret_optimizer_state")
+
+    multipliers = self._lagrange_multipliers(state)
+    loss = (
+        objective + standard_ops.tensordot(multipliers, proxy_constraints, 1))
+    multipliers_gradient = constraints
+
+    update_ops = []
+    if self.constraint_optimizer is None:
+      # If we don't have a separate constraint_optimizer, then we use
+      # self._optimizer for both the update of the model parameters, and that of
+      # the internal state.
+      grads_and_vars = self.optimizer.compute_gradients(
+          loss,
+          var_list=var_list,
+          gate_gradients=gate_gradients,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          grad_loss=grad_loss)
+      grads_and_vars.append(
+          self._constraint_grad_and_var(state, multipliers_gradient))
+      update_ops.append(
+          self.optimizer.apply_gradients(grads_and_vars, name="update"))
+    else:
+      # If we have a separate constraint_optimizer, then we use self._optimizer
+      # for the update of the model parameters, and self._constraint_optimizer
+      # for that of the internal state.
+      grads_and_vars = self.optimizer.compute_gradients(
+          loss,
+          var_list=var_list,
+          gate_gradients=gate_gradients,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          grad_loss=grad_loss)
+      multiplier_grads_and_vars = [
+          self._constraint_grad_and_var(state, multipliers_gradient)
+      ]
+
+      gradients = [
+          gradient for gradient, _ in grads_and_vars + multiplier_grads_and_vars
+          if gradient is not None
+      ]
+      with ops.control_dependencies(gradients):
+        update_ops.append(
+            self.optimizer.apply_gradients(grads_and_vars, name="update"))
+        update_ops.append(
+            self.constraint_optimizer.apply_gradients(
+                multiplier_grads_and_vars, name="optimizer_state_update"))
+
+    with ops.control_dependencies(update_ops):
+      if global_step is None:
+        # If we don't have a global step, just project, and we're done.
+        return self._projection_op(state, name=name)
+      else:
+        # If we have a global step, then we need to increment it in addition to
+        # projecting.
+        projection_op = self._projection_op(state, name="project")
+        with ops.colocate_with(global_step):
+          global_step_op = state_ops.assign_add(
+              global_step, 1, name="global_step_increment")
+        return control_flow_ops.group(projection_op, global_step_op, name=name)
+
+
+class AdditiveExternalRegretOptimizer(_ExternalRegretOptimizer):
+  """A `ConstrainedOptimizer` based on external-regret minimization.
+
+  This `ConstrainedOptimizer` uses the given `tf.train.Optimizer`s to jointly
+  minimize over the model parameters, and maximize over Lagrange multipliers,
+  with the latter maximization using additive updates and an algorithm that
+  minimizes external regret.
+
+  For more specifics, please refer to:
+
+  > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+  > Constrained Optimization".
+  > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+  The formulation used by this optimizer--which is simply the usual Lagrangian
+  formulation--can be found in Definition 1, and is discussed in Section 3. It
+  is most similar to Algorithm 3 in Appendix C.3, with the two differences being
+  that it uses proxy constraints (if they're provided) in the update of the
+  model parameters, and uses `tf.train.Optimizer`s, instead of SGD, for the
+  "inner" updates.
+  """
+
+  def __init__(self,
+               optimizer,
+               constraint_optimizer=None,
+               maximum_multiplier_radius=None):
+    """Constructs a new `AdditiveExternalRegretOptimizer`.
+
+    Args:
+      optimizer: tf.train.Optimizer, used to optimize the objective and
+        proxy_constraints portion of ConstrainedMinimizationProblem. If
+        constraint_optimizer is not provided, this will also be used to optimize
+        the Lagrange multipliers.
+      constraint_optimizer: optional tf.train.Optimizer, used to optimize the
+        Lagrange multipliers.
+      maximum_multiplier_radius: float, an optional upper bound to impose on the
+        sum of the Lagrange multipliers.
+
+    Returns:
+      A new `AdditiveExternalRegretOptimizer`.
+
+    Raises:
+      ValueError: If the maximum_multiplier_radius parameter is nonpositive.
+    """
+    super(AdditiveExternalRegretOptimizer, self).__init__(
+        optimizer=optimizer, constraint_optimizer=constraint_optimizer)
+
+    if maximum_multiplier_radius and (maximum_multiplier_radius <= 0.0):
+      raise ValueError("maximum_multiplier_radius must be strictly positive")
+
+    self._maximum_multiplier_radius = maximum_multiplier_radius
+
+  def _initial_state(self, num_constraints):
+    # For an AdditiveExternalRegretOptimizer, the internal state is simply a
+    # tensor of Lagrange multipliers with shape (m,), where m is the number of
+    # constraints.
+    return standard_ops.zeros((num_constraints,), dtype=dtypes.float32)
+
+  def _lagrange_multipliers(self, state):
+    return state
+
+  def _constraint_grad_and_var(self, state, gradient):
+    # TODO(acotter): tf.colocate_with(), if colocate_gradients_with_ops is True?
+    return (-gradient, state)
+
+  def _projection_op(self, state, name=None):
+    with ops.colocate_with(state):
+      if self._maximum_multiplier_radius:
+        projected_multipliers = _project_multipliers_wrt_euclidean_norm(
+            state, self._maximum_multiplier_radius)
+      else:
+        projected_multipliers = standard_ops.maximum(state, 0.0)
+      return state_ops.assign(state, projected_multipliers, name=name)
diff --git a/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer_test.py b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer_test.py
new file mode 100644
index 00000000000..9b4bf627100
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer_test.py
@@ -0,0 +1,136 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for constrained_optimization.python.external_regret_optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.constrained_optimization.python import external_regret_optimizer
+from tensorflow.contrib.constrained_optimization.python import test_util
+
+from tensorflow.python.ops import standard_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
+
+
+class AdditiveExternalRegretOptimizerWrapper(
+    external_regret_optimizer.AdditiveExternalRegretOptimizer):
+  """Testing wrapper class around AdditiveExternalRegretOptimizer.
+
+  This class is identical to AdditiveExternalRegretOptimizer, except that it
+  caches the internal optimization state when _lagrange_multipliers() is called,
+  so that we can test that the Lagrange multipliers take on their expected
+  values.
+  """
+
+  def __init__(self,
+               optimizer,
+               constraint_optimizer=None,
+               maximum_multiplier_radius=None):
+    """Same as AdditiveExternalRegretOptimizer.__init__."""
+    super(AdditiveExternalRegretOptimizerWrapper, self).__init__(
+        optimizer=optimizer,
+        constraint_optimizer=constraint_optimizer,
+        maximum_multiplier_radius=maximum_multiplier_radius)
+    self._cached_lagrange_multipliers = None
+
+  @property
+  def lagrange_multipliers(self):
+    """Returns the cached Lagrange multipliers."""
+    return self._cached_lagrange_multipliers
+
+  def _lagrange_multipliers(self, state):
+    """Caches the internal state for testing."""
+    self._cached_lagrange_multipliers = super(
+        AdditiveExternalRegretOptimizerWrapper,
+        self)._lagrange_multipliers(state)
+    return self._cached_lagrange_multipliers
+
+
+class ExternalRegretOptimizerTest(test.TestCase):
+
+  def test_project_multipliers_wrt_euclidean_norm(self):
+    """Tests Euclidean projection routine on some known values."""
+    multipliers1 = standard_ops.constant([-0.1, -0.6, -0.3])
+    expected_projected_multipliers1 = np.array([0.0, 0.0, 0.0])
+
+    multipliers2 = standard_ops.constant([-0.1, 0.6, 0.3])
+    expected_projected_multipliers2 = np.array([0.0, 0.6, 0.3])
+
+    multipliers3 = standard_ops.constant([0.4, 0.7, -0.2, 0.5, 0.1])
+    expected_projected_multipliers3 = np.array([0.2, 0.5, 0.0, 0.3, 0.0])
+
+    with self.test_session() as session:
+      projected_multipliers1 = session.run(
+          external_regret_optimizer._project_multipliers_wrt_euclidean_norm(
+              multipliers1, 1.0))
+      projected_multipliers2 = session.run(
+          external_regret_optimizer._project_multipliers_wrt_euclidean_norm(
+              multipliers2, 1.0))
+      projected_multipliers3 = session.run(
+          external_regret_optimizer._project_multipliers_wrt_euclidean_norm(
+              multipliers3, 1.0))
+
+    self.assertAllClose(
+        expected_projected_multipliers1,
+        projected_multipliers1,
+        rtol=0,
+        atol=1e-6)
+    self.assertAllClose(
+        expected_projected_multipliers2,
+        projected_multipliers2,
+        rtol=0,
+        atol=1e-6)
+    self.assertAllClose(
+        expected_projected_multipliers3,
+        projected_multipliers3,
+        rtol=0,
+        atol=1e-6)
+
+  def test_additive_external_regret_optimizer(self):
+    """Tests that the Lagrange multipliers update as expected."""
+    minimization_problem = test_util.ConstantMinimizationProblem(
+        np.array([0.6, -0.1, 0.4]))
+    optimizer = AdditiveExternalRegretOptimizerWrapper(
+        gradient_descent.GradientDescentOptimizer(1.0),
+        maximum_multiplier_radius=1.0)
+    train_op = optimizer.minimize_constrained(minimization_problem)
+
+    expected_multipliers = [
+        np.array([0.0, 0.0, 0.0]),
+        np.array([0.6, 0.0, 0.4]),
+        np.array([0.7, 0.0, 0.3]),
+        np.array([0.8, 0.0, 0.2]),
+        np.array([0.9, 0.0, 0.1]),
+        np.array([1.0, 0.0, 0.0]),
+        np.array([1.0, 0.0, 0.0]),
+    ]
+
+    multipliers = []
+    with self.test_session() as session:
+      session.run(standard_ops.global_variables_initializer())
+      while len(multipliers) < len(expected_multipliers):
+        multipliers.append(session.run(optimizer.lagrange_multipliers))
+        session.run(train_op)
+
+    for expected, actual in zip(expected_multipliers, multipliers):
+      self.assertAllClose(expected, actual, rtol=0, atol=1e-6)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
new file mode 100644
index 00000000000..04014ab4aeb
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
@@ -0,0 +1,595 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Defines `{Additive,Multiplicative}SwapRegretOptimizer`s.
+
+These optimizers minimize a `ConstrainedMinimizationProblem` by using a
+swap-regret minimizing algorithm (either SGD or multiplicative weights) to learn
+what weights should be associated with the objective function and constraints.
+These algorithms do *not* use Lagrange multipliers, but the idea is similar.
+The main differences between the formulation used here, and the standard
+Lagrangian formulation, are that (i) the objective function is weighted, in
+addition to the constraints, and (ii) we learn a matrix of weights, instead of a
+vector.
+
+For the purposes of constrained optimization, at least in theory,
+external-regret minimization suffices if the `ConstrainedMinimizationProblem`
+we're optimizing doesn't have any `proxy_constraints`, while swap-regret
+minimization should be used if `proxy_constraints` are present.
+
+For more specifics, please refer to:
+
+> Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+> Constrained Optimization".
+> [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+The formulation used by both of the SwapRegretOptimizers can be found in
+Definition 2, and is discussed in Section 4. The
+`MultiplicativeSwapRegretOptimizer` is most similar to Algorithm 2 in Section 4,
+with the difference being that it uses `tf.train.Optimizer`s, instead of SGD,
+for the "inner" updates. The `AdditiveSwapRegretOptimizer` differs further in
+that it performs additive (instead of multiplicative) updates of the stochastic
+matrix.
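+
+A rough usage sketch (`problem` is a hypothetical
+ConstrainedMinimizationProblem; only the `optimizer` argument is shown):
+
+```python
+optimizer = MultiplicativeSwapRegretOptimizer(
+    optimizer=tf.train.AdagradOptimizer(learning_rate=1.0))
+train_op = optimizer.minimize_constrained(problem)
+```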
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+import math
+
+import six
+
+from tensorflow.contrib.constrained_optimization.python import constrained_optimizer
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import standard_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import optimizer as train_optimizer
+
+
+def _maximal_eigenvector_power_method(matrix,
+                                      epsilon=1e-6,
+                                      maximum_iterations=100):
+  """Returns the maximal right-eigenvector of `matrix` using the power method.
+
+  Args:
+    matrix: 2D Tensor, the matrix of which we will find the maximal
+      right-eigenvector.
+    epsilon: nonnegative float, if two iterations of the power method differ (in
+      L2 norm) by no more than epsilon, we will terminate.
+    maximum_iterations: nonnegative int, if we perform this many iterations, we
+      will terminate.
+
+  Returns:
+    The maximal right-eigenvector of `matrix`.
+
+  Raises:
+    ValueError: If the epsilon or maximum_iterations parameters violate their
+      bounds.
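+
+  For intuition, each iteration performs the usual power-method update,
+  which in NumPy terms (a sketch, not code used by this module) would be:
+
+    new_eigenvector = np.matmul(matrix, eigenvector)
+    eigenvector = new_eigenvector / np.linalg.norm(new_eigenvector)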
+  """
+  if epsilon <= 0.0:
+    raise ValueError("epsilon must be strictly positive")
+  if maximum_iterations <= 0:
+    raise ValueError("maximum_iterations must be strictly positive")
+
+  def while_loop_condition(iteration, eigenvector, old_eigenvector):
+    """Returns false if the while loop should terminate."""
+    not_done = (iteration < maximum_iterations)
+    not_converged = (standard_ops.norm(eigenvector - old_eigenvector) > epsilon)
+    return standard_ops.logical_and(not_done, not_converged)
+
+  def while_loop_body(iteration, eigenvector, old_eigenvector):
+    """Performs one iteration of the power method."""
+    del old_eigenvector  # Needed by the condition, but not the body.
+    iteration += 1
+    # We need to use tf.matmul() and tf.expand_dims(), instead of
+    # tf.tensordot(), since the former will infer the shape of the result, while
+    # the latter will not (tf.while_loop() needs the shapes).
+    new_eigenvector = standard_ops.matmul(
+        matrix, standard_ops.expand_dims(eigenvector, 1))[:, 0]
+    new_eigenvector /= standard_ops.norm(new_eigenvector)
+    return (iteration, new_eigenvector, eigenvector)
+
+  iteration = standard_ops.constant(0)
+  eigenvector = standard_ops.ones_like(matrix[:, 0])
+  eigenvector /= standard_ops.norm(eigenvector)
+
+  # We actually want a do-while loop, so we explicitly call while_loop_body()
+  # once before tf.while_loop().
+  iteration, eigenvector, old_eigenvector = while_loop_body(
+      iteration, eigenvector, eigenvector)
+  iteration, eigenvector, old_eigenvector = control_flow_ops.while_loop(
+      while_loop_condition,
+      while_loop_body,
+      loop_vars=(iteration, eigenvector, old_eigenvector),
+      name="power_method")
+
+  return eigenvector
+
+
+def _project_stochastic_matrix_wrt_euclidean_norm(matrix):
+  """Projects its argument onto the set of left-stochastic matrices.
+
+  This algorithm is O(n^3) at worst, where `matrix` is n*n. It can be done in
+  O(n^2 * log(n)) time by sorting each column (and maybe better with a
+  different algorithm), but this version is easier to implement in TensorFlow.
+
+  Args:
+    matrix: 2d square tensor, the matrix to project.
+
+  Returns:
+    The 2d square tensor that results from projecting `matrix` onto the set of
+      left-stochastic matrices w.r.t. the Euclidean norm applied column-wise
+      (i.e. the Frobenius norm).
+
+  Raises:
+    ValueError: if the `matrix` tensor does not have a fully-known shape, or is
+      not two-dimensional and square.
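+
+  For example, the column [0.5, 0.8, -0.3] projects to [0.35, 0.65, 0.0]
+  (worked by hand): the negative entry is clipped to zero and 0.15 is
+  subtracted from each remaining entry to restore a column sum of one.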
+  """
+  matrix_shape = matrix.get_shape()
+  if matrix_shape is None:
+    raise ValueError("matrix must have known shape")
+  if matrix_shape.ndims != 2:
+    raise ValueError(
+        "matrix must be two dimensional (instead is %d-dimensional)" %
+        matrix_shape.ndims)
+  if matrix_shape[0] != matrix_shape[1]:
+    raise ValueError("matrix must be be square (instead has shape (%d,%d))" %
+                     (matrix_shape[0], matrix_shape[1]))
+  dimension = matrix_shape[0].value
+  if dimension is None:
+    raise ValueError("matrix must have fully-known shape")
+
+  def while_loop_condition(iteration, matrix, inactive, old_inactive):
+    """Returns false if the while loop should terminate."""
+    del matrix  # Needed by the body, but not the condition.
+    not_done = (iteration < dimension)
+    not_converged = standard_ops.reduce_any(
+        standard_ops.not_equal(inactive, old_inactive))
+    return standard_ops.logical_and(not_done, not_converged)
+
+  def while_loop_body(iteration, matrix, inactive, old_inactive):
+    """Performs one iteration of the projection."""
+    del old_inactive  # Needed by the condition, but not the body.
+    iteration += 1
+    scale = (1.0 - standard_ops.reduce_sum(
+        matrix, axis=0, keep_dims=True)) / standard_ops.maximum(
+            1.0, standard_ops.reduce_sum(inactive, axis=0, keep_dims=True))
+    matrix += scale * inactive
+    new_inactive = standard_ops.to_float(matrix > 0)
+    matrix *= new_inactive
+    return (iteration, matrix, new_inactive, inactive)
+
+  iteration = standard_ops.constant(0)
+  inactive = standard_ops.ones_like(matrix)
+
+  # We actually want a do-while loop, so we explicitly call while_loop_body()
+  # once before tf.while_loop().
+  iteration, matrix, inactive, old_inactive = while_loop_body(
+      iteration, matrix, inactive, inactive)
+  iteration, matrix, inactive, old_inactive = control_flow_ops.while_loop(
+      while_loop_condition,
+      while_loop_body,
+      loop_vars=(iteration, matrix, inactive, old_inactive),
+      name="euclidean_projection")
+
+  return matrix
+
+
+def _project_log_stochastic_matrix_wrt_kl_divergence(log_matrix):
+  """Projects its argument onto the set of log-left-stochastic matrices.
+
+  Args:
+    log_matrix: 2d square tensor, the element-wise logarithm of the matrix to
+      project.
+
+  Returns:
+    The 2d square tensor that results from projecting exp(`log_matrix`) onto
+      the set of left-stochastic matrices w.r.t. the KL-divergence applied
+      column-wise.
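+
+  Equivalently, this normalizes each column in log space: in NumPy terms (a
+  sketch, not code used by this module), the result is
+  log_matrix - logsumexp(log_matrix, axis=0, keepdims=True), so that each
+  column of exp(log_matrix) sums to one.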
+  """
+
+  # For numerical reasons, make sure that the largest matrix element is zero
+  # before exponentiating.
+  log_matrix -= standard_ops.reduce_max(log_matrix, axis=0, keep_dims=True)
+  log_matrix -= standard_ops.log(
+      standard_ops.reduce_sum(
+          standard_ops.exp(log_matrix), axis=0, keep_dims=True))
+  return log_matrix
+
+
+@six.add_metaclass(abc.ABCMeta)
+class _SwapRegretOptimizer(constrained_optimizer.ConstrainedOptimizer):
+  """Base class representing a `_SwapRegretOptimizer`.
+
+  This class contains most of the logic for performing constrained optimization,
+  minimizing swap regret for the constraints player. What it *doesn't* do is
+  keep track of the internal state (the stochastic matrix).  Instead, the state
+  is accessed via the _initial_state(), _stochastic_matrix(),
+  _constraint_grad_and_var() and _projection_op() methods.
+
+  The reason for this is that we want to make it easy to implement different
+  representations of the internal state. For example, for additive updates, it's
+  most natural to store the stochastic matrix directly, whereas for
+  multiplicative updates, it's most natural to store its element-wise logarithm.
+
+  For more specifics, please refer to:
+
+  > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+  > Constrained Optimization".
+  > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+  The formulation used by `_SwapRegretOptimizer`s can be found in Definition 2,
+  and is discussed in Section 4. Such optimizers are most similar to Algorithm
+  2 in Section 4. Most notably, the internal state is a left-stochastic matrix
+  of shape (m+1,m+1), where m is the number of constraints.
+  """
+
+  def __init__(self, optimizer, constraint_optimizer=None):
+    """Constructs a new `_SwapRegretOptimizer`.
+
+    The difference between `optimizer` and `constraint_optimizer` (if the latter
+    is provided) is that the former is used for learning the model parameters,
+    while the latter is used for the update to the constraint/objective weight
+    matrix (the analogue of Lagrange multipliers). If no `constraint_optimizer`
+    is provided, then `optimizer` is used for both.
+
+    Args:
+      optimizer: tf.train.Optimizer, used to optimize the objective and
+        proxy_constraints portion of ConstrainedMinimizationProblem. If
+        constraint_optimizer is not provided, this will also be used to optimize
+        the Lagrange multiplier analogues.
+      constraint_optimizer: optional tf.train.Optimizer, used to optimize the
+        Lagrange multiplier analogues.
+
+    Returns:
+      A new `_SwapRegretOptimizer`.
+    """
+    super(_SwapRegretOptimizer, self).__init__(optimizer=optimizer)
+    self._constraint_optimizer = constraint_optimizer
+
+  @property
+  def constraint_optimizer(self):
+    """Returns the `tf.train.Optimizer` used for the matrix."""
+    return self._constraint_optimizer
+
+  @abc.abstractmethod
+  def _initial_state(self, num_constraints):
+    pass
+
+  @abc.abstractmethod
+  def _stochastic_matrix(self, state):
+    pass
+
+  def _distribution(self, state):
+    distribution = _maximal_eigenvector_power_method(
+        self._stochastic_matrix(state))
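+    # The matrix is left-stochastic and the power method starts from a
+    # positive vector, so the result should already be nonnegative; the
+    # absolute value and renormalization below guard against numerical error.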
+    distribution = standard_ops.abs(distribution)
+    distribution /= standard_ops.reduce_sum(distribution)
+    return distribution
+
+  @abc.abstractmethod
+  def _constraint_grad_and_var(self, state, gradient):
+    pass
+
+  @abc.abstractmethod
+  def _projection_op(self, state, name=None):
+    pass
+
+  def minimize_constrained(self,
+                           minimization_problem,
+                           global_step=None,
+                           var_list=None,
+                           gate_gradients=train_optimizer.Optimizer.GATE_OP,
+                           aggregation_method=None,
+                           colocate_gradients_with_ops=False,
+                           name=None,
+                           grad_loss=None):
+    """Returns an `Op` for minimizing the constrained problem.
+
+    The `optimizer` constructor parameter will be used to update the model
+    parameters, while the constraint/objective weight matrix (the analogue of
+    Lagrange multipliers) will be updated using `constrained_optimizer` (if
+    provided) or `optimizer` (if not). Whether the matrix updates are additive
+    or multiplicative depends on the derived class.
+
+    Args:
+      minimization_problem: ConstrainedMinimizationProblem, the problem to
+        optimize.
+      global_step: as in `tf.train.Optimizer`'s `minimize` method.
+      var_list: as in `tf.train.Optimizer`'s `minimize` method.
+      gate_gradients: as in `tf.train.Optimizer`'s `minimize` method.
+      aggregation_method: as in `tf.train.Optimizer`'s `minimize` method.
+      colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize`
+        method.
+      name: as in `tf.train.Optimizer`'s `minimize` method.
+      grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
+
+    Returns:
+      TensorFlow Op.
+    """
+    objective = minimization_problem.objective
+
+    constraints = minimization_problem.constraints
+    proxy_constraints = minimization_problem.proxy_constraints
+    if proxy_constraints is None:
+      proxy_constraints = constraints
+    # Flatten both constraints tensors to 1d.
+    num_constraints = minimization_problem.num_constraints
+    constraints = standard_ops.reshape(constraints, shape=(num_constraints,))
+    proxy_constraints = standard_ops.reshape(
+        proxy_constraints, shape=(num_constraints,))
+
+    # We use a lambda to initialize the state so that, if this function call is
+    # inside the scope of a tf.control_dependencies() block, the dependencies
+    # will not be applied to the initializer.
+    state = standard_ops.Variable(
+        lambda: self._initial_state(num_constraints),
+        trainable=False,
+        name="swap_regret_optimizer_state")
+
+    zero_and_constraints = standard_ops.concat(
+        (standard_ops.zeros((1,)), constraints), axis=0)
+    objective_and_proxy_constraints = standard_ops.concat(
+        (standard_ops.expand_dims(objective, 0), proxy_constraints), axis=0)
+
+    distribution = self._distribution(state)
+    loss = standard_ops.tensordot(distribution, objective_and_proxy_constraints,
+                                  1)
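+    # The update direction for the stochastic matrix is the outer product of
+    # the constraint violations (prepended with a zero for the objective) and
+    # the current distribution.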
+    matrix_gradient = standard_ops.matmul(
+        standard_ops.expand_dims(zero_and_constraints, 1),
+        standard_ops.expand_dims(distribution, 0))
+
+    update_ops = []
+    if self.constraint_optimizer is None:
+      # If we don't have a separate constraint_optimizer, then we use
+      # self._optimizer for both the update of the model parameters, and that of
+      # the internal state.
+      grads_and_vars = self.optimizer.compute_gradients(
+          loss,
+          var_list=var_list,
+          gate_gradients=gate_gradients,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          grad_loss=grad_loss)
+      grads_and_vars.append(
+          self._constraint_grad_and_var(state, matrix_gradient))
+      update_ops.append(
+          self.optimizer.apply_gradients(grads_and_vars, name="update"))
+    else:
+      # If we have a separate constraint_optimizer, then we use self._optimizer
+      # for the update of the model parameters, and self._constraint_optimizer
+      # for that of the internal state.
+      grads_and_vars = self.optimizer.compute_gradients(
+          loss,
+          var_list=var_list,
+          gate_gradients=gate_gradients,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          grad_loss=grad_loss)
+      matrix_grads_and_vars = [
+          self._constraint_grad_and_var(state, matrix_gradient)
+      ]
+
+      gradients = [
+          gradient for gradient, _ in grads_and_vars + matrix_grads_and_vars
+          if gradient is not None
+      ]
+      with ops.control_dependencies(gradients):
+        update_ops.append(
+            self.optimizer.apply_gradients(grads_and_vars, name="update"))
+        update_ops.append(
+            self.constraint_optimizer.apply_gradients(
+                matrix_grads_and_vars, name="optimizer_state_update"))
+
+    with ops.control_dependencies(update_ops):
+      if global_step is None:
+        # If we don't have a global step, just project, and we're done.
+        return self._projection_op(state, name=name)
+      else:
+        # If we have a global step, then we need to increment it in addition to
+        # projecting.
+        projection_op = self._projection_op(state, name="project")
+        with ops.colocate_with(global_step):
+          global_step_op = state_ops.assign_add(
+              global_step, 1, name="global_step_increment")
+        return control_flow_ops.group(projection_op, global_step_op, name=name)
+
+
+class AdditiveSwapRegretOptimizer(_SwapRegretOptimizer):
+  """A `ConstrainedOptimizer` based on swap-regret minimization.
+
+  This `ConstrainedOptimizer` uses the given `tf.train.Optimizer`s to jointly
+  minimize over the model parameters, and maximize over constraint/objective
+  weight matrix (the analogue of Lagrange multipliers), with the latter
+  maximization using additive updates and an algorithm that minimizes swap
+  regret.
+
+  For more specifics, please refer to:
+
+  > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+  > Constrained Optimization".
+  > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+  The formulation used by this optimizer can be found in Definition 2, and is
+  discussed in Section 4. It is most similar to Algorithm 2 in Section 4, with
+  the differences being that it uses `tf.train.Optimizer`s, instead of SGD, for
+  the "inner" updates, and performs additive (instead of multiplicative) updates
+  of the stochastic matrix.
+  """
+
+  def __init__(self, optimizer, constraint_optimizer=None):
+    """Constructs a new `AdditiveSwapRegretOptimizer`.
+
+    Args:
+      optimizer: tf.train.Optimizer, used to optimize the objective and
+        proxy_constraints portion of ConstrainedMinimizationProblem. If
+        constraint_optimizer is not provided, this will also be used to optimize
+        the Lagrange multiplier analogues.
+      constraint_optimizer: optional tf.train.Optimizer, used to optimize the
+        Lagrange multiplier analogues.
+
+    Returns:
+      A new `AdditiveSwapRegretOptimizer`.
+    """
+    # TODO(acotter): add a parameter determining the initial values of the
+    # matrix elements (like initial_multiplier_radius in
+    # MultiplicativeSwapRegretOptimizer).
+    super(AdditiveSwapRegretOptimizer, self).__init__(
+        optimizer=optimizer, constraint_optimizer=constraint_optimizer)
+
+  def _initial_state(self, num_constraints):
+    # For an AdditiveSwapRegretOptimizer, the internal state is a tensor of
+    # shape (m+1,m+1), where m is the number of constraints, representing a
+    # left-stochastic matrix.
+    dimension = num_constraints + 1
+    # Initialize by putting all weight on the objective, and none on the
+    # constraints.
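+    # For example, with two constraints the initial state is
+    #   [[1.0, 1.0, 1.0],
+    #    [0.0, 0.0, 0.0],
+    #    [0.0, 0.0, 0.0]],
+    # in which every column places all of its weight on the objective row.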
+    return standard_ops.concat(
+        (standard_ops.ones(
+            (1, dimension)), standard_ops.zeros((dimension - 1, dimension))),
+        axis=0)
+
+  def _stochastic_matrix(self, state):
+    return state
+
+  def _constraint_grad_and_var(self, state, gradient):
+    # TODO(acotter): tf.colocate_with(), if colocate_gradients_with_ops is True?
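+    # The gradient is negated so that the (descent) optimizer *maximizes*
+    # over the matrix, as required by the two-player formulation.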
+    return (-gradient, state)
+
+  def _projection_op(self, state, name=None):
+    with ops.colocate_with(state):
+      return state_ops.assign(
+          state,
+          _project_stochastic_matrix_wrt_euclidean_norm(state),
+          name=name)
+
+
+class MultiplicativeSwapRegretOptimizer(_SwapRegretOptimizer):
+  """A `ConstrainedOptimizer` based on swap-regret minimization.
+
+  This `ConstrainedOptimizer` uses the given `tf.train.Optimizer`s to jointly
+  minimize over the model parameters, and maximize over constraint/objective
+  weight matrix (the analogue of Lagrange multipliers), with the latter
+  maximization using multiplicative updates and an algorithm that minimizes swap
+  regret.
+
+  For more specifics, please refer to:
+
+  > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex
+  > Constrained Optimization".
+  > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500)
+
+  The formulation used by this optimizer can be found in Definition 2, and is
+  discussed in Section 4. It is most similar to Algorithm 2 in Section 4, with
+  the difference being that it uses `tf.train.Optimizer`s, instead of SGD, for
+  the "inner" updates.
+  """
+
+  def __init__(self,
+               optimizer,
+               constraint_optimizer=None,
+               minimum_multiplier_radius=1e-3,
+               initial_multiplier_radius=None):
+    """Constructs a new `MultiplicativeSwapRegretOptimizer`.
+
+    Args:
+      optimizer: tf.train.Optimizer, used to optimize the objective and
+        proxy_constraints portion of ConstrainedMinimizationProblem. If
+        constraint_optimizer is not provided, this will also be used to optimize
+        the Lagrange multiplier analogues.
+      constraint_optimizer: optional tf.train.Optimizer, used to optimize the
+        Lagrange multiplier analogues.
+      minimum_multiplier_radius: float, each element of the matrix will be lower
+        bounded by `minimum_multiplier_radius` divided by one plus the number of
+        constraints.
+      initial_multiplier_radius: float, the initial value of each element of the
+        matrix associated with a constraint (i.e. excluding those elements
+        associated with the objective) will be `initial_multiplier_radius`
+        divided by one plus the number of constraints. Defaults to the value of
+        `minimum_multiplier_radius`.
+
+    Returns:
+      A new `MultiplicativeSwapRegretOptimizer`.
+
+    Raises:
+      ValueError: If the two radius parameters are inconsistent.
+    """
+    super(MultiplicativeSwapRegretOptimizer, self).__init__(
+        optimizer=optimizer, constraint_optimizer=constraint_optimizer)
+
+    if (minimum_multiplier_radius <= 0.0) or (minimum_multiplier_radius >= 1.0):
+      raise ValueError("minimum_multiplier_radius must be in the range (0,1)")
+    if initial_multiplier_radius is None:
+      initial_multiplier_radius = minimum_multiplier_radius
+    elif (initial_multiplier_radius <
+          minimum_multiplier_radius) or (minimum_multiplier_radius > 1.0):
+      raise ValueError("initial_multiplier_radius must be in the range "
+                       "[minimum_multiplier_radius,1]")
+
+    self._minimum_multiplier_radius = minimum_multiplier_radius
+    self._initial_multiplier_radius = initial_multiplier_radius
+
+  def _initial_state(self, num_constraints):
+    # For a MultiplicativeSwapRegretOptimizer, the internal state is a tensor of
+    # shape (m+1,m+1), where m is the number of constraints, representing the
+    # element-wise logarithm of a left-stochastic matrix.
+    dimension = num_constraints + 1
+    # Initialize by putting as much weight as possible on the objective, and as
+    # little as possible on the constraints.
+    log_initial_one = math.log(1.0 - (self._initial_multiplier_radius *
+                                      (dimension - 1) / (dimension)))
+    log_initial_zero = math.log(self._initial_multiplier_radius / dimension)
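+    # With radius r and dimension d, each column is
+    # [1 - r * (d - 1) / d, r / d, ..., r / d], which sums to one, as
+    # required of a left-stochastic matrix.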
+    return standard_ops.concat(
+        (standard_ops.constant(
+            log_initial_one, dtype=dtypes.float32, shape=(1, dimension)),
+         standard_ops.constant(
+             log_initial_zero,
+             dtype=dtypes.float32,
+             shape=(dimension - 1, dimension))),
+        axis=0)
+
+  def _stochastic_matrix(self, state):
+    return standard_ops.exp(state)
+
+  def _constraint_grad_and_var(self, state, gradient):
+    # TODO(acotter): tf.colocate_with(), if colocate_gradients_with_ops is True?
+    return (-gradient, state)
+
+  def _projection_op(self, state, name=None):
+    with ops.colocate_with(state):
+      # Gets the dimension of the state (num_constraints + 1)--all of these
+      # assertions are of things that should be impossible, since the state
+      # passed into this method will have the same shape as that returned by
+      # _initial_state().
+      state_shape = state.get_shape()
+      assert state_shape is not None
+      assert state_shape.ndims == 2
+      assert state_shape[0] == state_shape[1]
+      dimension = state_shape[0].value
+      assert dimension is not None
+
+      minimum_log_multiplier = standard_ops.log(
+          self._minimum_multiplier_radius / standard_ops.to_float(dimension))
+
+      return state_ops.assign(
+          state,
+          standard_ops.maximum(
+              _project_log_stochastic_matrix_wrt_kl_divergence(state),
+              minimum_log_multiplier),
+          name=name)
diff --git a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer_test.py b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer_test.py
new file mode 100644
index 00000000000..34c4543dca9
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer_test.py
@@ -0,0 +1,212 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for constrained_optimization.python.swap_regret_optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.constrained_optimization.python import swap_regret_optimizer
+from tensorflow.contrib.constrained_optimization.python import test_util
+
+from tensorflow.python.ops import standard_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
+
+
+class AdditiveSwapRegretOptimizerWrapper(
+    swap_regret_optimizer.AdditiveSwapRegretOptimizer):
+  """Testing wrapper class around AdditiveSwapRegretOptimizer.
+
+  This class is identical to AdditiveSwapRegretOptimizer, except that it caches
+  the internal optimization state when _stochastic_matrix() is called, so that
+  we can test that the stochastic matrices take on their expected values.
+  """
+
+  def __init__(self, optimizer, constraint_optimizer=None):
+    """Same as AdditiveSwapRegretOptimizer.__init__()."""
+    super(AdditiveSwapRegretOptimizerWrapper, self).__init__(
+        optimizer=optimizer, constraint_optimizer=constraint_optimizer)
+    self._cached_stochastic_matrix = None
+
+  @property
+  def stochastic_matrix(self):
+    """Returns the cached stochastic matrix."""
+    return self._cached_stochastic_matrix
+
+  def _stochastic_matrix(self, state):
+    """Caches the internal state for testing."""
+    self._cached_stochastic_matrix = super(AdditiveSwapRegretOptimizerWrapper,
+                                           self)._stochastic_matrix(state)
+    return self._cached_stochastic_matrix
+
+
+class MultiplicativeSwapRegretOptimizerWrapper(
+    swap_regret_optimizer.MultiplicativeSwapRegretOptimizer):
+  """Testing wrapper class around MultiplicativeSwapRegretOptimizer.
+
+  This class is identical to MultiplicativeSwapRegretOptimizer, except that it
+  caches the internal optimization state when _stochastic_matrix() is called, so
+  that we can test that the stochastic matrices take on their expected values.
+  """
+
+  def __init__(self,
+               optimizer,
+               constraint_optimizer=None,
+               minimum_multiplier_radius=1e-3,
+               initial_multiplier_radius=None):
+    """Same as MultiplicativeSwapRegretOptimizer.__init__()."""
+    super(MultiplicativeSwapRegretOptimizerWrapper, self).__init__(
+        optimizer=optimizer,
+        constraint_optimizer=constraint_optimizer,
+        minimum_multiplier_radius=minimum_multiplier_radius,
+        initial_multiplier_radius=initial_multiplier_radius)
+    self._cached_stochastic_matrix = None
+
+  @property
+  def stochastic_matrix(self):
+    """Returns the cached stochastic matrix."""
+    return self._cached_stochastic_matrix
+
+  def _stochastic_matrix(self, state):
+    """Caches the internal state for testing."""
+    self._cached_stochastic_matrix = super(
+        MultiplicativeSwapRegretOptimizerWrapper,
+        self)._stochastic_matrix(state)
+    return self._cached_stochastic_matrix
+
+
+class SwapRegretOptimizerTest(test.TestCase):
+
+  def test_maximum_eigenvector_power_method(self):
+    """Tests power method routine on some known left-stochastic matrices."""
+    matrix1 = np.matrix([[0.6, 0.1, 0.1], [0.0, 0.6, 0.9], [0.4, 0.3, 0.0]])
+    matrix2 = np.matrix([[0.4, 0.4, 0.2], [0.2, 0.1, 0.5], [0.4, 0.5, 0.3]])
+
+    with self.test_session() as session:
+      eigenvector1 = session.run(
+          swap_regret_optimizer._maximal_eigenvector_power_method(
+              standard_ops.constant(matrix1)))
+      eigenvector2 = session.run(
+          swap_regret_optimizer._maximal_eigenvector_power_method(
+              standard_ops.constant(matrix2)))
+
+    # Check that eigenvector1 and eigenvector2 are eigenvectors of matrix1 and
+    # matrix2 (respectively) with associated eigenvalue 1.
+    matrix_eigenvector1 = np.tensordot(matrix1, eigenvector1, axes=1)
+    matrix_eigenvector2 = np.tensordot(matrix2, eigenvector2, axes=1)
+    self.assertAllClose(eigenvector1, matrix_eigenvector1, rtol=0, atol=1e-6)
+    self.assertAllClose(eigenvector2, matrix_eigenvector2, rtol=0, atol=1e-6)
+
+  def test_project_stochastic_matrix_wrt_euclidean_norm(self):
+    """Tests Euclidean projection routine on some known values."""
+    matrix = standard_ops.constant([[-0.1, -0.1, 0.4], [-0.8, 0.4, 1.2],
+                                    [-0.3, 0.1, 0.2]])
+    expected_projected_matrix = np.array([[0.6, 0.1, 0.1], [0.0, 0.6, 0.9],
+                                          [0.4, 0.3, 0.0]])
+
+    with self.test_session() as session:
+      projected_matrix = session.run(
+          swap_regret_optimizer._project_stochastic_matrix_wrt_euclidean_norm(
+              matrix))
+
+    self.assertAllClose(
+        expected_projected_matrix, projected_matrix, rtol=0, atol=1e-6)
+
+  def test_project_log_stochastic_matrix_wrt_kl_divergence(self):
+    """Tests KL-divergence projection routine on some known values."""
+    matrix = standard_ops.constant([[0.2, 0.8, 0.6], [0.1, 0.2, 1.5],
+                                    [0.2, 1.0, 0.9]])
+    expected_projected_matrix = np.array([[0.4, 0.4, 0.2], [0.2, 0.1, 0.5],
+                                          [0.4, 0.5, 0.3]])
+
+    with self.test_session() as session:
+      projected_matrix = session.run(
+          standard_ops.exp(
+              swap_regret_optimizer.
+              _project_log_stochastic_matrix_wrt_kl_divergence(
+                  standard_ops.log(matrix))))
+
+    self.assertAllClose(
+        expected_projected_matrix, projected_matrix, rtol=0, atol=1e-6)
+
+  def test_additive_swap_regret_optimizer(self):
+    """Tests that the stochastic matrices update as expected."""
+    minimization_problem = test_util.ConstantMinimizationProblem(
+        np.array([0.6, -0.1, 0.4]))
+    optimizer = AdditiveSwapRegretOptimizerWrapper(
+        gradient_descent.GradientDescentOptimizer(1.0))
+    train_op = optimizer.minimize_constrained(minimization_problem)
+
+    # Calculated using a numpy+python implementation of the algorithm.
+    expected_matrices = [
+        np.array([[1.0, 1.0, 1.0, 1.0], [0.0, 0.0, 0.0, 0.0],
+                  [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]]),
+        np.array([[0.66666667, 1.0, 1.0, 1.0], [0.26666667, 0.0, 0.0, 0.0],
+                  [0.0, 0.0, 0.0, 0.0], [0.06666667, 0.0, 0.0, 0.0]]),
+        np.array([[0.41666667, 0.93333333, 1.0,
+                   0.98333333], [0.46666667, 0.05333333, 0.0,
+                                 0.01333333], [0.0, 0.0, 0.0, 0.0],
+                  [0.11666667, 0.01333333, 0.0, 0.00333333]]),
+    ]
+
+    matrices = []
+    with self.test_session() as session:
+      session.run(standard_ops.global_variables_initializer())
+      while len(matrices) < len(expected_matrices):
+        matrices.append(session.run(optimizer.stochastic_matrix))
+        session.run(train_op)
+
+    for expected, actual in zip(expected_matrices, matrices):
+      self.assertAllClose(expected, actual, rtol=0, atol=1e-6)
+
+  def test_multiplicative_swap_regret_optimizer(self):
+    """Tests that the stochastic matrices update as expected."""
+    minimization_problem = test_util.ConstantMinimizationProblem(
+        np.array([0.6, -0.1, 0.4]))
+    optimizer = MultiplicativeSwapRegretOptimizerWrapper(
+        gradient_descent.GradientDescentOptimizer(1.0),
+        initial_multiplier_radius=0.8)
+    train_op = optimizer.minimize_constrained(minimization_problem)
+
+    # Calculated using a numpy+python implementation of the algorithm.
+    expected_matrices = [
+        np.array([[0.4, 0.4, 0.4, 0.4], [0.2, 0.2, 0.2, 0.2],
+                  [0.2, 0.2, 0.2, 0.2], [0.2, 0.2, 0.2, 0.2]]),
+        np.array([[0.36999014, 0.38528351, 0.38528351, 0.38528351], [
+            0.23517483, 0.21720297, 0.21720297, 0.21720297
+        ], [0.17774131, 0.18882719, 0.18882719, 0.18882719],
+                  [0.21709373, 0.20868632, 0.20868632, 0.20868632]]),
+        np.array([[0.33972109, 0.36811863, 0.37118462, 0.36906575], [
+            0.27114826, 0.23738228, 0.23376693, 0.23626491
+        ], [0.15712313, 0.17641793, 0.17858959, 0.17708679],
+                  [0.23200752, 0.21808115, 0.21645886, 0.21758255]]),
+    ]
+
+    matrices = []
+    with self.test_session() as session:
+      session.run(standard_ops.global_variables_initializer())
+      while len(matrices) < len(expected_matrices):
+        matrices.append(session.run(optimizer.stochastic_matrix))
+        session.run(train_op)
+
+    for expected, actual in zip(expected_matrices, matrices):
+      self.assertAllClose(expected, actual, rtol=0, atol=1e-6)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/constrained_optimization/python/test_util.py b/tensorflow/contrib/constrained_optimization/python/test_util.py
new file mode 100644
index 00000000000..704b36ca4c9
--- /dev/null
+++ b/tensorflow/contrib/constrained_optimization/python/test_util.py
@@ -0,0 +1,58 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains helpers used by tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.constrained_optimization.python import constrained_minimization_problem
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import standard_ops
+
+
+class ConstantMinimizationProblem(
+    constrained_minimization_problem.ConstrainedMinimizationProblem):
+  """A `ConstrainedMinimizationProblem` with constant constraint violations.
+
+  This minimization problem is intended for use in performing simple tests of
+  the Lagrange multiplier (or equivalent) update in the optimizers. There is a
+  one-element "dummy" model parameter, but it should be ignored.
+  """
+
+  def __init__(self, constraints):
+    """Constructs a new `ConstantMinimizationProblem'.
+
+    Args:
+      constraints: 1d numpy array, the constant constraint violations.
+
+    Returns:
+      A new `ConstantMinimizationProblem`.
+    """
+    # We make a fake 1-parameter linear objective so that we don't get a "no
+    # variables to optimize" error.
+    self._objective = standard_ops.Variable(0.0, dtype=dtypes.float32)
+    self._constraints = standard_ops.constant(constraints, dtype=dtypes.float32)
+
+  @property
+  def objective(self):
+    """Returns the objective function."""
+    return self._objective
+
+  @property
+  def constraints(self):
+    """Returns the constant constraint violations."""
+    return self._constraints
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 7b508f87ab7..677ea65edd9 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -63,6 +63,7 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/autograph/pyct/static_analysis:static_analysis",
     "//tensorflow/contrib/boosted_trees:boosted_trees_pip",
     "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
+    "//tensorflow/contrib/constrained_optimization:constrained_optimization_pip",
     "//tensorflow/contrib/data/python/kernel_tests:dataset_serialization_test",
     "//tensorflow/contrib/data/python/ops:contrib_op_loader",
     "//tensorflow/contrib/eager/python/examples:examples_pip",

From 762fa5f6ead8f662e5cc14420293cb369f2b9615 Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar 
Date: Mon, 23 Apr 2018 15:57:16 -0700
Subject: [PATCH 0633/1734] FakeQuant operations before ReLUs (occurring after
 bypass nodes) aren't needed.

PiperOrigin-RevId: 193999591
---
 .../contrib/quantize/python/quantize.py       | 68 ++++++++++++-------
 .../quantize/python/quantize_graph_test.py    | 14 ----
 .../contrib/quantize/python/quantize_test.py  | 57 ++++++++++++----
 3 files changed, 87 insertions(+), 52 deletions(-)

diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index d2d0426d233..efc1a94b3c6 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -133,19 +133,27 @@ def Quantize(graph,
           bits=activation_bits,
           producer_scope=scope,
           consumer_scope=scope)
-      _InsertQuantOp(
-          add_context,
-          'add_quant',
-          layer_match.bypass_op,
-          input_to_ops_map.ConsumerOperations(layer_match.bypass_op),
-          is_training,
-          moving_avg=True,
-          ema_decay=ema_decay,
-          quant_delay=quant_delay,
-          vars_collection=vars_collection,
-          bits=activation_bits,
-          producer_scope=scope,
-          consumer_scope=scope)
+      # Make sure the op following this isn't an activation. If it is, we
+      # shouldn't quantize it, since the activation will be fused into the
+      # Add at inference time.
+      consumers = input_to_ops_map.ConsumerOperations(layer_match.bypass_op)
+      if any([consumer.type in _ACTIVATION_TYPES for consumer in consumers]):
+        logging.info('Skipping %s, because it is followed by an activation.',
+                     layer_match.bypass_op.name)
+      else:
+        _InsertQuantOp(
+            add_context,
+            'add_quant',
+            layer_match.bypass_op,
+            input_to_ops_map.ConsumerOperations(layer_match.bypass_op),
+            is_training,
+            moving_avg=True,
+            ema_decay=ema_decay,
+            quant_delay=quant_delay,
+            vars_collection=vars_collection,
+            bits=activation_bits,
+            producer_scope=scope,
+            consumer_scope=scope)
 
     # Quantize bypass ops that occur after the activation.
     if layer_match.post_activation_bypass_op is not None:
@@ -153,19 +161,27 @@ def Quantize(graph,
           r'^(.*)/([^/]+)', layer_match.post_activation_bypass_op.name).group(1)
       # If `scope` is given, only quantize it if the producer is in the right
       # scope.
-      _InsertQuantOp(
-          post_activation_bypass_context,
-          'post_activation_bypass_quant',
-          layer_match.post_activation_bypass_op,
-          input_to_ops_map.ConsumerOperations(
-              layer_match.post_activation_bypass_op),
-          is_training,
-          moving_avg=True,
-          ema_decay=ema_decay,
-          quant_delay=quant_delay,
-          vars_collection=vars_collection,
-          bits=activation_bits,
-          producer_scope=scope)
+      # Make sure the op following this isn't an activation. If it is, we
+      # shouldn't quantize it, since the activation will be fused into the
+      # Add at inference time.
+      consumers = input_to_ops_map.ConsumerOperations(
+          layer_match.post_activation_bypass_op)
+      if any([consumer.type in _ACTIVATION_TYPES for consumer in consumers]):
+        logging.info('Skipping %s, because it is followed by an activation.',
+                     layer_match.post_activation_bypass_op.name)
+      else:
+        _InsertQuantOp(
+            post_activation_bypass_context,
+            'post_activation_bypass_quant',
+            layer_match.post_activation_bypass_op,
+            consumers,
+            is_training,
+            moving_avg=True,
+            ema_decay=ema_decay,
+            quant_delay=quant_delay,
+            vars_collection=vars_collection,
+            bits=activation_bits,
+            producer_scope=scope)
 
 
 def _FindLayersToQuantize(graph):
diff --git a/tensorflow/contrib/quantize/python/quantize_graph_test.py b/tensorflow/contrib/quantize/python/quantize_graph_test.py
index caf8ff28d50..54faf582f15 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph_test.py
@@ -113,20 +113,6 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
       # Ensure that variables were added.
       self.assertTrue(len(orig_variable_names) < len(q_variables))
 
-  def testWithPreActivationBypass(self):
-    self._RunTestOverAllRewrites(self._TestWithPreActivationBypass)
-
-  def _TestWithPreActivationBypass(self, rewrite_fn):
-    # Tests that the default graph is correctly used when no args are provided
-    # to rewrite_fn.
-    with ops.Graph().as_default() as g:
-      self._ConvLayer(pre_activation_bypass=True, scope='scope1')
-      rewrite_fn()
-
-      op_names = [op.name for op in g.get_operations()]
-      self.assertTrue(
-          any('scope1/add_quant/' in name for name in op_names))
-
   def testWithPostActivationBypass(self):
     self._RunTestOverAllRewrites(self._TestWithPostActivationBypass)
 
diff --git a/tensorflow/contrib/quantize/python/quantize_test.py b/tensorflow/contrib/quantize/python/quantize_test.py
index d37c83d6839..5e479f39468 100644
--- a/tensorflow/contrib/quantize/python/quantize_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_test.py
@@ -82,9 +82,22 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
 
     quantization_node_name = 'FakeQuantWithMinMaxVars'
-    add_quant = graph.get_operation_by_name('test/add_quant/' +
-                                            quantization_node_name)
-    self.assertEqual(add_quant.type, quantization_node_name)
+    conv_quant = graph.get_operation_by_name('test/test/conv_quant/' +
+                                             quantization_node_name)
+    self.assertEqual(conv_quant.type, quantization_node_name)
+
+    # Scan through all FakeQuant operations, ensuring that the activation
+    # isn't in the consumers of the operation. Since activations are folded
+    # into the preceding operation during inference, the FakeQuant operation
+    # after the activation is all that is needed.
+    for op in graph.get_operations():
+      if op.type == quantization_node_name:
+        quant_op = graph.get_operation_by_name(op.name)
+        consumers = []
+        for output in quant_op.outputs:
+          consumers.extend(output.consumers())
+
+        self.assertNotIn('test/identity', [c.name for c in consumers])
 
   def testInsertQuantOpForAddAfterSeparableConv2d(self):
     self._RunTestOverParameters(
@@ -109,9 +122,20 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
 
     quantization_node_name = 'FakeQuantWithMinMaxVars'
-    add_quant = graph.get_operation_by_name('test/add_quant/' +
-                                            quantization_node_name)
-    self.assertEqual(add_quant.type, quantization_node_name)
+    conv_quant = graph.get_operation_by_name('test/test/conv_quant/' +
+                                             quantization_node_name)
+    self.assertEqual(conv_quant.type, quantization_node_name)
+
+    for op in graph.get_operations():
+      if op.type == quantization_node_name:
+        quant_op = graph.get_operation_by_name(op.name)
+        # Scan through all FakeQuant operations, ensuring that the activation
+        # identity op isn't in the consumers of the operation.
+        consumers = []
+        for output in quant_op.outputs:
+          consumers.extend(output.consumers())
+
+        self.assertNotIn('test/identity', [c.name for c in consumers])
 
   def testFinalLayerQuantized(self):
     self._RunTestOverParameters(self._TestFinalLayerQuantized)
@@ -153,12 +177,21 @@ class QuantizeTest(test_util.TensorFlowTestCase):
           activation_fn=array_ops.identity,
           scope='test/test')
       bypass_tensor = math_ops.add(conv, input2, name='test/add')
-      _ = array_ops.identity(bypass_tensor, name='test/output')
+      # The output of the post_activation bypass will be another layer.
+      _ = conv2d(
+          bypass_tensor,
+          32, [5, 5],
+          stride=2,
+          padding='SAME',
+          weights_initializer=self._WeightInit(0.09),
+          activation_fn=array_ops.identity,
+          scope='test/unused')
 
       quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
 
-      # Ensure that the bypass node is preceded and followed by
-      # FakeQuantWithMinMaxVars operations.
+      # Ensure that the bypass node is preceded by and followed by a
+      # FakeQuantWithMinMaxVars operation, since the output of the Add isn't an
+      # activation.
       self.assertTrue('FakeQuantWithMinMaxVars' in
                       [c.type for c in bypass_tensor.consumers()])
       self.assertTrue('FakeQuantWithMinMaxVars' in
@@ -198,9 +231,9 @@ class QuantizeTest(test_util.TensorFlowTestCase):
 
       quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
 
-      # Ensure that the bypass node is preceded and followed by
-      # FakeQuantWithMinMaxVars operations.
-      self.assertTrue('FakeQuantWithMinMaxVars' in
+      # Ensure that the bypass node is preceded by a FakeQuantWithMinMaxVars
+      # operation, and NOT followed by one.
+      self.assertTrue('FakeQuantWithMinMaxVars' not in
                       [c.type for c in bypass_tensor.consumers()])
       self.assertTrue('FakeQuantWithMinMaxVars' in
                       [i.op.type for i in bypass_tensor.op.inputs])

From 5809ad4436863ac82279c66d6cff6a4bffd77878 Mon Sep 17 00:00:00 2001
From: Francois Chollet 
Date: Mon, 23 Apr 2018 16:27:00 -0700
Subject: [PATCH 0634/1734] Add `static_state_saving_rnn` back to the `nn`
 module.

PiperOrigin-RevId: 194003971
---
 tensorflow/python/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index 13f8420a670..c1702ae13c2 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -160,6 +160,7 @@ nn.dynamic_rnn = rnn.dynamic_rnn
 nn.static_rnn = rnn.static_rnn
 nn.raw_rnn = rnn.raw_rnn
 nn.bidirectional_dynamic_rnn = rnn.bidirectional_dynamic_rnn
+nn.static_state_saving_rnn = rnn.static_state_saving_rnn
 nn.rnn_cell = rnn_cell
 
 # Symbols whitelisted for export without documentation.

From ba39780114c648445d3285550bf7f5c1e9e8a251 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Mon, 23 Apr 2018 16:29:27 -0700
Subject: [PATCH 0635/1734] Avoid inlining the split handler functions as it
 slows down the trainer startup significantly.

PiperOrigin-RevId: 194004319
---
 .../learner/batch/ordinal_split_handler.py    | 32 +++++++++++++++----
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
index 7df514cd207..9d6cc9245aa 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
@@ -417,9 +417,18 @@ class SparseSplitHandler(InequalitySplitHandler):
     return (are_splits_ready, partition_ids, gains, split_infos)
 
 
-@function.Defun(dtypes.bool, dtypes.bool, dtypes.float32, dtypes.float32,
-                dtypes.int32, dtypes.float32, dtypes.float32, dtypes.float32,
-                dtypes.float32, dtypes.float32)
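+# noinline=True: inlining the split handler functions significantly slows
+# down trainer startup.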
+@function.Defun(
+    dtypes.bool,
+    dtypes.bool,
+    dtypes.float32,
+    dtypes.float32,
+    dtypes.int32,
+    dtypes.float32,
+    dtypes.float32,
+    dtypes.float32,
+    dtypes.float32,
+    dtypes.float32,
+    noinline=True)
 def dense_make_stats_update(is_active, are_buckets_ready, float_column,
                             quantile_buckets, example_partition_ids, gradients,
                             hessians, weights, empty_gradients, empty_hessians):
@@ -452,9 +461,20 @@ def dense_make_stats_update(is_active, are_buckets_ready, float_column,
           gradients, hessians)
 
 
-@function.Defun(dtypes.bool, dtypes.bool, dtypes.int64, dtypes.float32,
-                dtypes.int64, dtypes.float32, dtypes.int32, dtypes.float32,
-                dtypes.float32, dtypes.float32, dtypes.float32, dtypes.float32)
+@function.Defun(
+    dtypes.bool,
+    dtypes.bool,
+    dtypes.int64,
+    dtypes.float32,
+    dtypes.int64,
+    dtypes.float32,
+    dtypes.int32,
+    dtypes.float32,
+    dtypes.float32,
+    dtypes.float32,
+    dtypes.float32,
+    dtypes.float32,
+    noinline=True)
 def sparse_make_stats_update(
     is_active, are_buckets_ready, sparse_column_indices, sparse_column_values,
     sparse_column_shape, quantile_buckets, example_partition_ids, gradients,

From a72155d58726d4dbb92d5d6b0f3290976bbdaa1c Mon Sep 17 00:00:00 2001
From: Alexandre Passos 
Date: Mon, 23 Apr 2018 16:33:27 -0700
Subject: [PATCH 0636/1734] Small fast path for binary_op_wrapper

PiperOrigin-RevId: 194004866
---
 tensorflow/python/ops/math_ops.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 01d670ea2d9..2b04866fef4 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -965,7 +965,9 @@ def _OverrideBinaryOperatorHelper(func, op_name, clazz_object=ops.Tensor):
 
   def binary_op_wrapper(x, y):
     with ops.name_scope(None, op_name, [x, y]) as name:
-      if not isinstance(y, sparse_tensor.SparseTensor):
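+      # Fast path: if both arguments are already Tensors, call the op
+      # directly and skip the conversion logic below.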
+      if isinstance(x, ops.Tensor) and isinstance(y, ops.Tensor):
+        return func(x, y, name=name)
+      elif not isinstance(y, sparse_tensor.SparseTensor):
         try:
           y = ops.convert_to_tensor(y, dtype=x.dtype.base_dtype, name="y")
         except TypeError:

From 84c73c2b4d0318bfd78a53ab6051169795604650 Mon Sep 17 00:00:00 2001
From: Allen Lavoie 
Date: Mon, 23 Apr 2018 16:46:41 -0700
Subject: [PATCH 0637/1734] TFTS: Support exogenous features in ARRegressor

They get flattened with the endogenous features as input to the model. Unlike
endogenous features, they're specified for the whole window when making
predictions.

Adds an ARRegressor example which uses exogenous features.

PiperOrigin-RevId: 194006630
---
 .../timeseries/examples/known_anomaly.py      |  75 +++++---
 .../timeseries/examples/known_anomaly_test.py |  18 +-
 .../timeseries/python/timeseries/ar_model.py  | 173 ++++++++++++++----
 .../python/timeseries/ar_model_test.py        |   8 +-
 .../python/timeseries/estimators.py           |  11 +-
 .../python/timeseries/estimators_test.py      |  48 +++--
 6 files changed, 255 insertions(+), 78 deletions(-)

diff --git a/tensorflow/contrib/timeseries/examples/known_anomaly.py b/tensorflow/contrib/timeseries/examples/known_anomaly.py
index e77628ddd39..71621abc719 100644
--- a/tensorflow/contrib/timeseries/examples/known_anomaly.py
+++ b/tensorflow/contrib/timeseries/examples/known_anomaly.py
@@ -41,17 +41,8 @@ _MODULE_PATH = path.dirname(__file__)
 _DATA_FILE = path.join(_MODULE_PATH, "data/changepoints.csv")
 
 
-def train_and_evaluate_exogenous(csv_file_name=_DATA_FILE, train_steps=300):
-  """Training, evaluating, and predicting on a series with changepoints."""
-
-  # Indicate the format of our exogenous feature, in this case a string
-  # representing a boolean value.
-  string_feature = tf.feature_column.categorical_column_with_vocabulary_list(
-      key="is_changepoint", vocabulary_list=["no", "yes"])
-  # Specify the way this feature is presented to the model, here using a one-hot
-  # encoding.
-  one_hot_feature = tf.feature_column.indicator_column(
-      categorical_column=string_feature)
+def state_space_estimator(exogenous_feature_columns):
+  """Constructs a StructuralEnsembleRegressor."""
 
   def _exogenous_update_condition(times, features):
     del times  # unused
@@ -62,14 +53,48 @@ def train_and_evaluate_exogenous(csv_file_name=_DATA_FILE, train_steps=300):
     # no changepoint.
     return tf.equal(tf.squeeze(features["is_changepoint"], axis=-1), "yes")
 
-  estimator = tf.contrib.timeseries.StructuralEnsembleRegressor(
-      periodicities=12,
-      # Extract a smooth period by constraining the number of latent values
-      # being cycled between.
-      cycle_num_latent_values=3,
-      num_features=1,
-      exogenous_feature_columns=[one_hot_feature],
-      exogenous_update_condition=_exogenous_update_condition)
+  return (
+      tf.contrib.timeseries.StructuralEnsembleRegressor(
+          periodicities=12,
+          # Extract a smooth period by constraining the number of latent values
+          # being cycled between.
+          cycle_num_latent_values=3,
+          num_features=1,
+          exogenous_feature_columns=exogenous_feature_columns,
+          exogenous_update_condition=_exogenous_update_condition),
+      # Use truncated backpropagation with a window size of 64, batching
+      # together 4 of these windows (random offsets) per training step. Training
+      # with exogenous features often requires somewhat larger windows.
+      4, 64)
+
+
+def autoregressive_estimator(exogenous_feature_columns):
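+  """Constructs an ARRegressor."""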
+  input_window_size = 8
+  output_window_size = 2
+  return (
+      tf.contrib.timeseries.ARRegressor(
+          periodicities=12,
+          num_features=1,
+          input_window_size=input_window_size,
+          output_window_size=output_window_size,
+          exogenous_feature_columns=exogenous_feature_columns),
+      64, input_window_size + output_window_size)
+
+
+def train_and_evaluate_exogenous(
+    estimator_fn, csv_file_name=_DATA_FILE, train_steps=300):
+  """Training, evaluating, and predicting on a series with changepoints."""
+  # Indicate the format of our exogenous feature, in this case a string
+  # representing a boolean value.
+  string_feature = tf.feature_column.categorical_column_with_vocabulary_list(
+      key="is_changepoint", vocabulary_list=["no", "yes"])
+  # Specify the way this feature is presented to the model, here using a one-hot
+  # encoding.
+  one_hot_feature = tf.feature_column.indicator_column(
+      categorical_column=string_feature)
+
+  estimator, batch_size, window_size = estimator_fn(
+      exogenous_feature_columns=[one_hot_feature])
   reader = tf.contrib.timeseries.CSVReader(
       csv_file_name,
       # Indicate the format of our CSV file. First we have two standard columns,
@@ -85,10 +110,7 @@ def train_and_evaluate_exogenous(csv_file_name=_DATA_FILE, train_steps=300):
       # This CSV has a header line; here we just ignore it.
       skip_header_lines=1)
   train_input_fn = tf.contrib.timeseries.RandomWindowInputFn(
-      # Use truncated backpropagation with a window size of 64, batching
-      # together 4 of these windows (random offsets) per training step. Training
-      # with exogenous features often requires somewhat larger windows.
-      reader, batch_size=4, window_size=64)
+      reader, batch_size=batch_size, window_size=window_size)
   estimator.train(input_fn=train_input_fn, steps=train_steps)
   evaluation_input_fn = tf.contrib.timeseries.WholeDatasetInputFn(reader)
   evaluation = estimator.evaluate(input_fn=evaluation_input_fn, steps=1)
@@ -145,7 +167,12 @@ def main(unused_argv):
   if not HAS_MATPLOTLIB:
     raise ImportError(
         "Please install matplotlib to generate a plot from this example.")
-  make_plot("Ignoring a known anomaly", *train_and_evaluate_exogenous())
+  make_plot("Ignoring a known anomaly (state space)",
+            *train_and_evaluate_exogenous(
+                estimator_fn=state_space_estimator))
+  make_plot("Ignoring a known anomaly (autoregressive)",
+            *train_and_evaluate_exogenous(
+                estimator_fn=autoregressive_estimator, train_steps=3000))
   pyplot.show()
 
 
diff --git a/tensorflow/contrib/timeseries/examples/known_anomaly_test.py b/tensorflow/contrib/timeseries/examples/known_anomaly_test.py
index c3e307cad81..8c64f2e186a 100644
--- a/tensorflow/contrib/timeseries/examples/known_anomaly_test.py
+++ b/tensorflow/contrib/timeseries/examples/known_anomaly_test.py
@@ -23,12 +23,24 @@ from tensorflow.contrib.timeseries.examples import known_anomaly
 from tensorflow.python.platform import test
 
 
-class KnownAnaomalyExampleTest(test.TestCase):
+class KnownAnomalyExampleTest(test.TestCase):
 
-  def test_shapes_and_variance_structural(self):
+  def test_shapes_and_variance_structural_ar(self):
     (times, observed, all_times, mean, upper_limit, lower_limit,
      anomaly_locations) = known_anomaly.train_and_evaluate_exogenous(
-         train_steps=50)
+         train_steps=1, estimator_fn=known_anomaly.autoregressive_estimator)
+    self.assertAllEqual(
+        anomaly_locations,
+        [25, 50, 75, 100, 125, 150, 175, 249])
+    self.assertAllEqual(all_times.shape, mean.shape)
+    self.assertAllEqual(all_times.shape, upper_limit.shape)
+    self.assertAllEqual(all_times.shape, lower_limit.shape)
+    self.assertAllEqual(times.shape, observed.shape)
+
+  def test_shapes_and_variance_structural_ssm(self):
+    (times, observed, all_times, mean, upper_limit, lower_limit,
+     anomaly_locations) = known_anomaly.train_and_evaluate_exogenous(
+         train_steps=50, estimator_fn=known_anomaly.state_space_esitmator)
     self.assertAllEqual(
         anomaly_locations,
         [25, 50, 75, 100, 125, 150, 175, 249])
diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
index 4f6527a5465..558d9480b49 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
@@ -60,7 +60,8 @@ class ARModel(model.TimeSeriesModel):
                num_features,
                num_time_buckets=10,
                loss=NORMAL_LIKELIHOOD_LOSS,
-               hidden_layer_sizes=None):
+               hidden_layer_sizes=None,
+               exogenous_feature_columns=None):
     """Constructs an auto-regressive model.
 
     Args:
@@ -81,6 +82,11 @@ class ARModel(model.TimeSeriesModel):
         observations and predictions, while the training loss is computed on
         normalized data (if input statistics are available).
       hidden_layer_sizes: list of sizes of hidden layers.
+      exogenous_feature_columns: A list of `tf.feature_column`s (for example
+          `tf.feature_column.embedding_column`) corresponding to exogenous
+          features which provide extra information to the model but are not part
+          of the series to be predicted. Passed to
+          `tf.feature_column.input_layer`.
     """
     self.input_window_size = input_window_size
     self.output_window_size = output_window_size
@@ -90,7 +96,12 @@ class ARModel(model.TimeSeriesModel):
     self.window_size = self.input_window_size + self.output_window_size
     self.loss = loss
     super(ARModel, self).__init__(
-        num_features=num_features)
+        num_features=num_features,
+        exogenous_feature_columns=exogenous_feature_columns)
+    if exogenous_feature_columns is not None:
+      self.exogenous_size = self._get_exogenous_embedding_shape()[-1]
+    else:
+      self.exogenous_size = 0
     assert num_time_buckets > 0
     self._buckets = int(num_time_buckets)
     if periodicities is None or not periodicities:
@@ -110,7 +121,10 @@ class ARModel(model.TimeSeriesModel):
     # that the serving input_receiver_fn gets placeholder shapes correct.
     return (array_ops.zeros([self.input_window_size], dtype=dtypes.int64),
             array_ops.zeros(
-                [self.input_window_size, self.num_features], dtype=self.dtype))
+                [self.input_window_size, self.num_features], dtype=self.dtype),
+            array_ops.zeros(
+                [self.input_window_size, self.exogenous_size],
+                dtype=self.dtype))
 
   # TODO(allenl,agarwal): Support sampling for AR.
   def random_model_parameters(self, seed=None):
@@ -163,7 +177,7 @@ class ARModel(model.TimeSeriesModel):
       activations.append((activation, activation_size))
     return activations
 
-  def prediction_ops(self, times, values):
+  def prediction_ops(self, times, values, exogenous_regressors):
     """Compute model predictions given input data.
 
     Args:
@@ -173,6 +187,8 @@ class ARModel(model.TimeSeriesModel):
           prediction times.
       values: A [batch size, self.input_window_size, self.num_features] Tensor
           with input features.
+      exogenous_regressors: A [batch size, self.window_size,
+          self.exogenous_size] Tensor with exogenous features.
     Returns:
       Tuple (predicted_mean, predicted_covariance), where each element is a
       Tensor with shape [batch size, self.output_window_size,
@@ -183,25 +199,33 @@ class ARModel(model.TimeSeriesModel):
     if self.input_window_size:
       values.get_shape().assert_is_compatible_with(
           [None, self.input_window_size, self.num_features])
+    if exogenous_regressors is not None:
+      exogenous_regressors.get_shape().assert_is_compatible_with(
+          [None, self.window_size, self.exogenous_size])
     # Create input features.
+    activation_components = []
     if self._periods:
       _, time_features = self._compute_time_features(times)
       activation_size = self.window_size * self._buckets * len(self._periods)
-      activation = array_ops.reshape(time_features, [-1, activation_size])
+      activation_components.append(
+          array_ops.reshape(time_features, [-1, activation_size]))
     else:
       activation_size = 0
-      activation = None
-
     if self.input_window_size:
       inp = array_ops.slice(values, [0, 0, 0], [-1, self.input_window_size, -1])
       inp_size = self.input_window_size * self.num_features
       inp = array_ops.reshape(inp, [-1, inp_size])
-      if activation is not None:
-        activation = array_ops.concat([inp, activation], 1)
-      else:
-        activation = inp
+      activation_components.append(inp)
       activation_size += inp_size
+    if self.exogenous_size:
+      exogenous_size = self.window_size * self.exogenous_size
+      activation_size += exogenous_size
+      exogenous_flattened = array_ops.reshape(
+          exogenous_regressors, [-1, exogenous_size])
+      activation_components.append(exogenous_flattened)
     assert activation_size
+    assert activation_components
+    activation = array_ops.concat(activation_components, axis=1)
     activations.append((activation, activation_size))
     # Create hidden layers.
     activations += self._create_hidden_stack(activation, activation_size)
@@ -228,6 +252,19 @@ class ARModel(model.TimeSeriesModel):
         math_ops.reduce_prod(array_ops.shape(targets)), loss_op.dtype)
     return loss_op
 
+  def _process_exogenous_features(self, times, features):
+    embedded = super(ARModel, self)._process_exogenous_features(
+        times=times, features=features)
+    if embedded is None:
+      assert self.exogenous_size == 0
+      # No embeddings. Return a zero-size [batch, times, 0] array so we don't
+      # have to special case it downstream.
+      return array_ops.zeros(
+          array_ops.concat([array_ops.shape(times), constant_op.constant([0])],
+                           axis=0))
+    else:
+      return embedded
+
   # TODO(allenl, agarwal): Consider better ways of warm-starting predictions.
   def predict(self, features):
     """Computes predictions multiple steps into the future.
@@ -243,6 +280,7 @@ class ARModel(model.TimeSeriesModel):
           segment of the time series before `TIMES`. This data is used
           to start of the autoregressive computation. This should have data for
           at least self.input_window_size timesteps.
+        Any exogenous features must also be included, with shapes prefixed by the shape of `TIMES`.
     Returns:
       A dictionary with keys, "mean", "covariance". The
       values are Tensors of shape [batch_size, predict window size,
@@ -250,25 +288,39 @@ class ARModel(model.TimeSeriesModel):
     """
     predict_times = math_ops.cast(
         ops.convert_to_tensor(features[PredictionFeatures.TIMES]), dtypes.int32)
+    exogenous_regressors = self._process_exogenous_features(
+        times=predict_times,
+        features={key: value for key, value in features.items()
+                  if key not in [TrainEvalFeatures.TIMES,
+                                 TrainEvalFeatures.VALUES,
+                                 PredictionFeatures.STATE_TUPLE]})
+    with ops.control_dependencies(
+        [check_ops.assert_equal(array_ops.shape(predict_times)[1],
+                                array_ops.shape(exogenous_regressors)[1])]):
+      exogenous_regressors = array_ops.identity(exogenous_regressors)
     batch_size = array_ops.shape(predict_times)[0]
     num_predict_values = array_ops.shape(predict_times)[1]
     prediction_iterations = ((num_predict_values + self.output_window_size - 1)
                              // self.output_window_size)
-    # Pad predict_times so as to have exact multiple of self.output_window_size
-    # values per example.
+    # Pad predict_times and exogenous regressors so each example has an exact
+    # multiple of self.output_window_size values.
     padding_size = (prediction_iterations * self.output_window_size -
                     num_predict_values)
-    padding = array_ops.zeros([batch_size, padding_size], predict_times.dtype)
-    predict_times = control_flow_ops.cond(
-        padding_size > 0, lambda: array_ops.concat([predict_times, padding], 1),
-        lambda: predict_times)
+    predict_times = array_ops.pad(
+        predict_times, [[0, 0], [0, padding_size]])
+    exogenous_regressors = array_ops.pad(
+        exogenous_regressors, [[0, 0], [0, padding_size], [0, 0]])
     state = features[PredictionFeatures.STATE_TUPLE]
-    (state_times, state_values) = state
+    (state_times, state_values, state_exogenous_regressors) = state
     state_times = math_ops.cast(
         ops.convert_to_tensor(state_times), dtypes.int32)
     state_values = ops.convert_to_tensor(state_values, dtype=self.dtype)
+    state_exogenous_regressors = ops.convert_to_tensor(
+        state_exogenous_regressors, dtype=self.dtype)
 
     initial_input_times = predict_times[:, :self.output_window_size]
+    initial_input_exogenous_regressors = (
+        exogenous_regressors[:, :self.output_window_size, :])
     if self.input_window_size > 0:
       initial_input_times = array_ops.concat(
           [state_times[:, -self.input_window_size:], initial_input_times], 1)
@@ -279,6 +331,11 @@ class ARModel(model.TimeSeriesModel):
           check_ops.assert_equal(values_size, times_size)
       ]):
         initial_input_values = state_values[:, -self.input_window_size:, :]
+        initial_input_exogenous_regressors = array_ops.concat(
+            [state_exogenous_regressors[:, -self.input_window_size:, :],
+             initial_input_exogenous_regressors[
+                 :, :self.output_window_size, :]],
+            axis=1)
     else:
       initial_input_values = 0
 
@@ -288,9 +345,10 @@ class ARModel(model.TimeSeriesModel):
       return math_ops.less(iteration_number, prediction_iterations)
 
     def _while_body(iteration_number, input_times, input_values,
-                    mean_ta, covariance_ta):
+                    input_exogenous_regressors, mean_ta, covariance_ta):
       """Predict self.output_window_size values."""
-      prediction_ops = self.prediction_ops(input_times, input_values)
+      prediction_ops = self.prediction_ops(
+          input_times, input_values, input_exogenous_regressors)
       predicted_mean = prediction_ops["mean"]
       predicted_covariance = prediction_ops["covariance"]
       offset = self.output_window_size * gen_math_ops.minimum(
@@ -299,20 +357,33 @@ class ARModel(model.TimeSeriesModel):
         if self.output_window_size < self.input_window_size:
           new_input_values = array_ops.concat(
               [input_values[:, self.output_window_size:, :], predicted_mean], 1)
+          new_input_exogenous_regressors = array_ops.concat(
+              [input_exogenous_regressors[:, -self.input_window_size:, :],
+               exogenous_regressors[
+                   :, offset:offset + self.output_window_size, :]],
+              axis=1)
           new_input_times = array_ops.concat([
-              input_times[:, self.output_window_size:],
+              input_times[:, -self.input_window_size:],
               predict_times[:, offset:offset + self.output_window_size]
           ], 1)
         else:
           new_input_values = predicted_mean[:, -self.input_window_size:, :]
+          new_input_exogenous_regressors = exogenous_regressors[
+              :,
+              offset - self.input_window_size:offset + self.output_window_size,
+              :]
           new_input_times = predict_times[
               :,
               offset - self.input_window_size:offset + self.output_window_size]
       else:
         new_input_values = input_values
+        new_input_exogenous_regressors = exogenous_regressors[
+            :, offset:offset + self.output_window_size, :]
         new_input_times = predict_times[:,
                                         offset:offset + self.output_window_size]
       new_input_times.set_shape(initial_input_times.get_shape())
+      new_input_exogenous_regressors.set_shape(
+          initial_input_exogenous_regressors.get_shape())
       new_mean_ta = mean_ta.write(iteration_number, predicted_mean)
       if isinstance(covariance_ta, tensor_array_ops.TensorArray):
         new_covariance_ta = covariance_ta.write(iteration_number,
@@ -322,6 +393,7 @@ class ARModel(model.TimeSeriesModel):
       return (iteration_number + 1,
               new_input_times,
               new_input_values,
+              new_input_exogenous_regressors,
               new_mean_ta,
               new_covariance_ta)
 
@@ -332,9 +404,13 @@ class ARModel(model.TimeSeriesModel):
                           if self.loss != ARModel.SQUARED_LOSS else 0.)
     mean_ta_init = tensor_array_ops.TensorArray(
         dtype=self.dtype, size=prediction_iterations)
-    _, _, _, mean_ta, covariance_ta = control_flow_ops.while_loop(
+    _, _, _, _, mean_ta, covariance_ta = control_flow_ops.while_loop(
         _while_condition, _while_body, [
-            0, initial_input_times, initial_input_values, mean_ta_init,
+            0,
+            initial_input_times,
+            initial_input_values,
+            initial_input_exogenous_regressors,
+            mean_ta_init,
             covariance_ta_init
         ])
 
@@ -366,11 +442,11 @@ class ARModel(model.TimeSeriesModel):
     return {"mean": predicted_mean,
             "covariance": predicted_covariance}
 
-  def _process_window(self, features, mode):
+  def _process_window(self, features, mode, exogenous_regressors):
     """Compute model outputs on a single window of data."""
-    # TODO(agarwal): Use exogenous features
     times = math_ops.cast(features[TrainEvalFeatures.TIMES], dtypes.int64)
     values = math_ops.cast(features[TrainEvalFeatures.VALUES], dtype=self.dtype)
+    exogenous_regressors = math_ops.cast(exogenous_regressors, dtype=self.dtype)
     original_values = values
 
     # Extra shape checking for the window size (above that in
@@ -395,7 +471,8 @@ class ARModel(model.TimeSeriesModel):
       input_values = values[:, :self.input_window_size, :]
     else:
       input_values = None
-    prediction_ops = self.prediction_ops(times, input_values)
+    prediction_ops = self.prediction_ops(
+        times, input_values, exogenous_regressors)
     prediction = prediction_ops["mean"]
     covariance = prediction_ops["covariance"]
     targets = array_ops.slice(values, [0, self.input_window_size, 0],
@@ -419,7 +496,8 @@ class ARModel(model.TimeSeriesModel):
     return model.ModelOutputs(
         loss=loss,
         end_state=(times[:, -self.input_window_size:],
-                   values[:, -self.input_window_size:, :]),
+                   values[:, -self.input_window_size:, :],
+                   exogenous_regressors[:, -self.input_window_size:, :]),
         predictions={"mean": prediction, "covariance": covariance,
                      "observed": original_values[:, -self.output_window_size:]},
         prediction_times=times[:, -self.output_window_size:])
@@ -454,17 +532,24 @@ class ARModel(model.TimeSeriesModel):
     """
     features = {feature_name: ops.convert_to_tensor(feature_value)
                 for feature_name, feature_value in features.items()}
+    times = features[TrainEvalFeatures.TIMES]
+    exogenous_regressors = self._process_exogenous_features(
+        times=times,
+        features={key: value for key, value in features.items()
+                  if key not in [TrainEvalFeatures.TIMES,
+                                 TrainEvalFeatures.VALUES,
+                                 PredictionFeatures.STATE_TUPLE]})
     if mode == estimator_lib.ModeKeys.TRAIN:
       # For training, we require the window size to be self.window_size as
       # iterating sequentially on larger windows could introduce a bias.
-      return self._process_window(features, mode=mode)
+      return self._process_window(
+          features, mode=mode, exogenous_regressors=exogenous_regressors)
     elif mode == estimator_lib.ModeKeys.EVAL:
       # For evaluation, we allow the user to pass in a larger window, in which
       # case we try to cover as much of the window as possible without
       # overlap. Quantitative evaluation is more efficient/correct with fixed
       # windows matching self.window_size (as with training), but this looping
       # allows easy plotting of "in-sample" predictions.
-      times = features[TrainEvalFeatures.TIMES]
       times.get_shape().assert_has_rank(2)
       static_window_size = times.get_shape()[1].value
       if (static_window_size is not None
@@ -500,7 +585,9 @@ class ARModel(model.TimeSeriesModel):
                 feature_name:
                 feature_value[:, base_offset:base_offset + self.window_size]
                 for feature_name, feature_value in features.items()},
-            mode=mode)
+            mode=mode,
+            exogenous_regressors=exogenous_regressors[
+                :, base_offset:base_offset + self.window_size])
         # This code needs to be updated if new predictions are added in
         # self._process_window
         assert len(model_outputs.predictions) == 3
@@ -525,7 +612,9 @@ class ARModel(model.TimeSeriesModel):
       batch_size = array_ops.shape(times)[0]
       prediction_shape = [batch_size, self.output_window_size * num_iterations,
                           self.num_features]
-      previous_state_times, previous_state_values = state
+      (previous_state_times,
+       previous_state_values,
+       previous_state_exogenous_regressors) = state
       # Make sure returned state always has windows of self.input_window_size,
       # even if we were passed fewer than self.input_window_size points this
       # time.
@@ -540,14 +629,24 @@ class ARModel(model.TimeSeriesModel):
              self._scale_data(values)], axis=1)[:, -self.input_window_size:, :]
         new_state_values.set_shape((None, self.input_window_size,
                                     self.num_features))
+        new_exogenous_regressors = array_ops.concat(
+            [previous_state_exogenous_regressors,
+             exogenous_regressors], axis=1)[:, -self.input_window_size:, :]
+        new_exogenous_regressors.set_shape(
+            (None,
+             self.input_window_size,
+             self.exogenous_size))
       else:
         # There is no state to keep, and the strided slices above do not handle
         # input_window_size=0.
         new_state_times = previous_state_times
         new_state_values = previous_state_values
+        new_exogenous_regressors = previous_state_exogenous_regressors
       return model.ModelOutputs(
           loss=math_ops.reduce_mean(loss_ta.stack(), axis=0),
-          end_state=(new_state_times, new_state_values),
+          end_state=(new_state_times,
+                     new_state_values,
+                     new_exogenous_regressors),
           predictions={
               "mean": array_ops.reshape(
                   array_ops.transpose(mean_ta.stack(), [1, 0, 2, 3]),
@@ -604,7 +703,8 @@ class AnomalyMixtureARModel(ARModel):
                num_features,
                anomaly_distribution=GAUSSIAN_ANOMALY,
                num_time_buckets=10,
-               hidden_layer_sizes=None):
+               hidden_layer_sizes=None,
+               exogenous_feature_columns=None):
     assert (anomaly_prior_probability < 1.0 and
             anomaly_prior_probability > 0.0)
     self._anomaly_prior_probability = anomaly_prior_probability
@@ -619,7 +719,8 @@ class AnomalyMixtureARModel(ARModel):
         input_window_size=input_window_size,
         output_window_size=output_window_size,
         loss=ARModel.NORMAL_LIKELIHOOD_LOSS,
-        hidden_layer_sizes=hidden_layer_sizes)
+        hidden_layer_sizes=hidden_layer_sizes,
+        exogenous_feature_columns=exogenous_feature_columns)
 
   def _create_anomaly_ops(self, times, values, prediction_ops_dict):
     anomaly_log_param = variable_scope.get_variable(
@@ -631,9 +732,9 @@ class AnomalyMixtureARModel(ARModel):
     # distribution.
     prediction_ops_dict["anomaly_params"] = gen_math_ops.exp(anomaly_log_param)
 
-  def prediction_ops(self, times, values):
+  def prediction_ops(self, times, values, exogenous_regressors):
     prediction_ops_dict = super(AnomalyMixtureARModel, self).prediction_ops(
-        times, values)
+        times, values, exogenous_regressors)
     self._create_anomaly_ops(times, values, prediction_ops_dict)
     return prediction_ops_dict
 
diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py
index 1e1ca4e77fc..d078ac8d463 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py
@@ -155,12 +155,15 @@ class ARModelTest(test.TestCase):
     state_times = np.expand_dims(train_data_times[:input_window_size], 0)
     state_values = np.expand_dims(
         train_data_values[:input_window_size, :], 0)
+    state_exogenous = state_times[:, :, None][:, :, :0]
 
     def prediction_input_fn():
       return ({
           PredictionFeatures.TIMES: training.limit_epochs(
               predict_times, num_epochs=1),
-          PredictionFeatures.STATE_TUPLE: (state_times, state_values)
+          PredictionFeatures.STATE_TUPLE: (state_times,
+                                           state_values,
+                                           state_exogenous)
       }, {})
     (predictions,) = tuple(estimator.predict(input_fn=prediction_input_fn))
     predicted_mean = predictions["mean"][:, 0]
@@ -246,7 +249,8 @@ class ARModelTest(test.TestCase):
       with session.Session():
         predicted_values = model.predict({
             PredictionFeatures.TIMES: [[4, 6, 10]],
-            PredictionFeatures.STATE_TUPLE: ([[1, 2]], [[[1.], [2.]]])
+            PredictionFeatures.STATE_TUPLE: (
+                [[1, 2]], [[[1.], [2.]]], [[[], []]])
         })
         variables.global_variables_initializer().run()
         self.assertAllEqual(predicted_values["mean"].eval().shape,
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators.py b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
index 886e1846e2a..f4608ca2d1c 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/estimators.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
@@ -190,7 +190,7 @@ class ARRegressor(TimeSeriesRegressor):
 
   def __init__(
       self, periodicities, input_window_size, output_window_size,
-      num_features, num_time_buckets=10,
+      num_features, exogenous_feature_columns=None, num_time_buckets=10,
       loss=ar_model.ARModel.NORMAL_LIKELIHOOD_LOSS, hidden_layer_sizes=None,
       anomaly_prior_probability=None, anomaly_distribution=None,
       optimizer=None, model_dir=None, config=None):
@@ -205,7 +205,12 @@ class ARRegressor(TimeSeriesRegressor):
       output_window_size: Number of future time steps to predict. Note that
         setting it to > 1 empirically seems to give a better fit.
       num_features: The dimensionality of the time series (one for univariate,
-          more than one for multivariate).
+        more than one for multivariate).
+      exogenous_feature_columns: A list of `tf.feature_column`s (for example
+        `tf.feature_column.embedding_column`) corresponding to exogenous
+        features which provide extra information to the model but are not part
+        of the series to be predicted. Passed to
+        `tf.feature_column.input_layer`.
       num_time_buckets: Number of buckets into which to divide (time %
         periodicity) for generating time based features.
       loss: Loss function to use for training. Currently supported values are
@@ -241,6 +246,7 @@ class ARRegressor(TimeSeriesRegressor):
         anomaly_distribution = ar_model.AnomalyMixtureARModel.GAUSSIAN_ANOMALY
       model = ar_model.ARModel(
           periodicities=periodicities, num_features=num_features,
+          exogenous_feature_columns=exogenous_feature_columns,
           num_time_buckets=num_time_buckets,
           input_window_size=input_window_size,
           output_window_size=output_window_size, loss=loss,
@@ -255,6 +261,7 @@ class ARRegressor(TimeSeriesRegressor):
           input_window_size=input_window_size,
           output_window_size=output_window_size,
           num_features=num_features,
+          exogenous_feature_columns=exogenous_feature_columns,
           num_time_buckets=num_time_buckets,
           hidden_layer_sizes=hidden_layer_sizes,
           anomaly_prior_probability=anomaly_prior_probability,
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
index 9f161c1695f..eebee053f8e 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
@@ -29,6 +29,7 @@ from tensorflow.contrib.timeseries.python.timeseries import saved_model_utils
 
 from tensorflow.python.client import session
 from tensorflow.python.estimator import estimator_lib
+from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
@@ -48,12 +49,17 @@ class TimeSeriesRegressorTest(test.TestCase):
   def _fit_restore_fit_test_template(self, estimator_fn, dtype):
     """Tests restoring previously fit models."""
     model_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
-    first_estimator = estimator_fn(model_dir)
+    exogenous_feature_columns = (
+        feature_column.numeric_column("exogenous"),
+    )
+    first_estimator = estimator_fn(model_dir, exogenous_feature_columns)
     times = numpy.arange(20, dtype=numpy.int64)
     values = numpy.arange(20, dtype=dtype.as_numpy_dtype)
+    exogenous = numpy.arange(20, dtype=dtype.as_numpy_dtype)
     features = {
         feature_keys.TrainEvalFeatures.TIMES: times,
-        feature_keys.TrainEvalFeatures.VALUES: values
+        feature_keys.TrainEvalFeatures.VALUES: values,
+        "exogenous": exogenous
     }
     train_input_fn = input_pipeline.RandomWindowInputFn(
         input_pipeline.NumpyReader(features), shuffle_seed=2, num_threads=1,
@@ -68,14 +74,19 @@ class TimeSeriesRegressorTest(test.TestCase):
     first_loss_after_fit = first_estimator.evaluate(
         input_fn=eval_input_fn, steps=1)["loss"]
     self.assertLess(first_loss_after_fit, first_loss_before_fit)
-    second_estimator = estimator_fn(model_dir)
+    second_estimator = estimator_fn(model_dir, exogenous_feature_columns)
     second_estimator.train(input_fn=train_input_fn, steps=2)
     whole_dataset_input_fn = input_pipeline.WholeDatasetInputFn(
         input_pipeline.NumpyReader(features))
     whole_dataset_evaluation = second_estimator.evaluate(
         input_fn=whole_dataset_input_fn, steps=1)
+    exogenous_values_ten_steps = {
+        "exogenous": numpy.arange(
+            10, dtype=dtype.as_numpy_dtype)[None, :, None]
+    }
     predict_input_fn = input_pipeline.predict_continuation_input_fn(
         evaluation=whole_dataset_evaluation,
+        exogenous_features=exogenous_values_ten_steps,
         steps=10)
     # Also tests that limit_epochs in predict_continuation_input_fn prevents
     # infinite iteration
@@ -92,6 +103,7 @@ class TimeSeriesRegressorTest(test.TestCase):
         saved_prediction = saved_model_utils.predict_continuation(
             continue_from=whole_dataset_evaluation,
             steps=10,
+            exogenous_features=exogenous_values_ten_steps,
             signatures=signatures,
             session=sess)
         # Saved model predictions should be the same as Estimator predictions
@@ -104,7 +116,8 @@ class TimeSeriesRegressorTest(test.TestCase):
             continue_from=whole_dataset_evaluation,
             features={
                 feature_keys.FilteringFeatures.TIMES: times[None, -1] + 2,
-                feature_keys.FilteringFeatures.VALUES: values[None, -1] + 2.
+                feature_keys.FilteringFeatures.VALUES: values[None, -1] + 2.,
+                "exogenous": values[None, -1, None] + 12.
             },
             signatures=signatures,
             session=sess)
@@ -112,6 +125,10 @@ class TimeSeriesRegressorTest(test.TestCase):
         second_saved_prediction = saved_model_utils.predict_continuation(
             continue_from=first_filtering,
             steps=1,
+            exogenous_features={
+                "exogenous": numpy.arange(
+                    1, dtype=dtype.as_numpy_dtype)[None, :, None]
+            },
             signatures=signatures,
             session=sess)
         self.assertEqual(
@@ -122,7 +139,8 @@ class TimeSeriesRegressorTest(test.TestCase):
             continue_from=first_filtering,
             features={
                 feature_keys.FilteringFeatures.TIMES: times[-1] + 3,
-                feature_keys.FilteringFeatures.VALUES: values[-1] + 3.
+                feature_keys.FilteringFeatures.VALUES: values[-1] + 3.,
+                "exogenous": values[-1, None] + 13.
             },
             signatures=signatures,
             session=sess)
@@ -131,7 +149,8 @@ class TimeSeriesRegressorTest(test.TestCase):
         six.assertCountEqual(
             self,
             [feature_keys.FilteringFeatures.TIMES,
-             feature_keys.FilteringFeatures.VALUES],
+             feature_keys.FilteringFeatures.VALUES,
+             "exogenous"],
             signatures.signature_def[
                 feature_keys.SavedModelLabels.COLD_START_FILTER].inputs.keys())
         batch_numpy_times = numpy.tile(
@@ -142,7 +161,8 @@ class TimeSeriesRegressorTest(test.TestCase):
             session=sess,
             features={
                 feature_keys.FilteringFeatures.TIMES: batch_numpy_times,
-                feature_keys.FilteringFeatures.VALUES: batch_numpy_values
+                feature_keys.FilteringFeatures.VALUES: batch_numpy_values,
+                "exogenous": 10. + batch_numpy_values
             }
         )
         predict_times = numpy.tile(
@@ -150,26 +170,32 @@ class TimeSeriesRegressorTest(test.TestCase):
         predictions = saved_model_utils.predict_continuation(
             continue_from=state,
             times=predict_times,
+            exogenous_features={
+                "exogenous": numpy.tile(numpy.arange(
+                    15, dtype=dtype.as_numpy_dtype), (10,))[None, :, None]
+            },
             signatures=signatures,
             session=sess)
         self.assertAllEqual([10, 15, 1], predictions["mean"].shape)
 
   def test_fit_restore_fit_ar_regressor(self):
-    def _estimator_fn(model_dir):
+    def _estimator_fn(model_dir, exogenous_feature_columns):
       return estimators.ARRegressor(
           periodicities=10, input_window_size=10, output_window_size=6,
           num_features=1, model_dir=model_dir, config=_SeedRunConfig(),
           # This test is flaky with normal likelihood loss (could add more
           # training iterations instead).
-          loss=ar_model.ARModel.SQUARED_LOSS)
+          loss=ar_model.ARModel.SQUARED_LOSS,
+          exogenous_feature_columns=exogenous_feature_columns)
     self._fit_restore_fit_test_template(_estimator_fn, dtype=dtypes.float32)
 
   def test_fit_restore_fit_structural_ensemble_regressor(self):
     dtype = dtypes.float32
-    def _estimator_fn(model_dir):
+    def _estimator_fn(model_dir, exogenous_feature_columns):
       return estimators.StructuralEnsembleRegressor(
           num_features=1, periodicities=10, model_dir=model_dir, dtype=dtype,
-          config=_SeedRunConfig())
+          config=_SeedRunConfig(),
+          exogenous_feature_columns=exogenous_feature_columns)
     self._fit_restore_fit_test_template(_estimator_fn, dtype=dtype)
 
 
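The changes above wire exogenous features through the AR estimator end to
end. A minimal usage sketch, assuming the contrib timeseries API at this
revision (the "pressure" feature name and the synthetic data are
illustrative only):

    import numpy as np
    import tensorflow as tf

    # One scalar exogenous feature, fed alongside TIMES/VALUES.
    exogenous_column = tf.feature_column.numeric_column("pressure")
    estimator = tf.contrib.timeseries.ARRegressor(
        periodicities=10, input_window_size=10, output_window_size=6,
        num_features=1, exogenous_feature_columns=[exogenous_column])
    features = {
        tf.contrib.timeseries.TrainEvalFeatures.TIMES: np.arange(20),
        tf.contrib.timeseries.TrainEvalFeatures.VALUES:
            np.arange(20, dtype=np.float32)[:, None],
        # Exogenous values share the shape of TIMES.
        "pressure": np.sin(np.arange(20, dtype=np.float32)),
    }
    # The window must cover input_window_size + output_window_size steps.
    train_input_fn = tf.contrib.timeseries.RandomWindowInputFn(
        tf.contrib.timeseries.NumpyReader(features),
        batch_size=4, window_size=16)
    estimator.train(input_fn=train_input_fn, steps=5)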

From a36e6edab33c7a5bef2f911d4d7bb88ffc8c7de6 Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling 
Date: Mon, 23 Apr 2018 16:51:59 -0700
Subject: [PATCH 0638/1734] Handle missing params for a few ops in Toco using
 default values.

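Graph producers other than the TensorFlow Python front-end can omit
optional attributes entirely, which previously tripped the unconditional
Get*Attr calls changed below. An abbreviated repro sketch (node contents
illustrative; a complete Pack node would also carry "N" and "T" attrs):

    from tensorflow.core.framework import graph_pb2

    graph_def = graph_pb2.GraphDef()
    node = graph_def.node.add()
    node.name = "stacked"
    node.op = "Pack"
    node.input.extend(["a:0", "b:0"])
    # "axis" is deliberately left unset; the importer now defaults it to 0
    # instead of CHECK-failing in GetIntAttr.
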
PiperOrigin-RevId: 194007329
---
 .../contrib/lite/toco/import_tensorflow.cc    | 31 +++++++++++++------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index 155d890c9f2..2ed05cb3720 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -1093,8 +1093,10 @@ void ConvertMatMulOperator(const NodeDef& node,
 
   // Transpose flags should be easy to support, but we don't have a
   // GraphDef with them to test on at the moment.
-  CHECK_EQ(GetBoolAttr(node, "transpose_a"), false);
-  CHECK_EQ(GetBoolAttr(node, "transpose_b"), false);
+  CHECK_EQ(HasAttr(node, "transpose_a") && GetBoolAttr(node, "transpose_a"),
+           false);
+  CHECK_EQ(HasAttr(node, "transpose_b") && GetBoolAttr(node, "transpose_b"),
+           false);
   CHECK(!HasAttr(node, "adjoint_a") ||
         (GetBoolAttr(node, "adjoint_a") == false));
   CHECK(!HasAttr(node, "adjoint_b") ||
@@ -1300,11 +1302,17 @@ void ConvertStridedSliceOperator(const NodeDef& node,
   }
   op->outputs.push_back(node.name());
 
-  op->begin_mask = GetIntAttr(node, "begin_mask");
-  op->ellipsis_mask = GetIntAttr(node, "ellipsis_mask");
-  op->end_mask = GetIntAttr(node, "end_mask");
-  op->new_axis_mask = GetIntAttr(node, "new_axis_mask");
-  op->shrink_axis_mask = GetIntAttr(node, "shrink_axis_mask");
+  op->begin_mask =
+      HasAttr(node, "begin_mask") ? GetIntAttr(node, "begin_mask") : 0;
+  op->ellipsis_mask =
+      HasAttr(node, "ellipsis_mask") ? GetIntAttr(node, "ellipsis_mask") : 0;
+  op->end_mask = HasAttr(node, "end_mask") ? GetIntAttr(node, "end_mask") : 0;
+  op->new_axis_mask =
+      HasAttr(node, "new_axis_mask") ? GetIntAttr(node, "new_axis_mask") : 0;
+  op->shrink_axis_mask = HasAttr(node, "shrink_axis_mask")
+                             ? GetIntAttr(node, "shrink_axis_mask")
+                             : 0;
+
   model->operators.emplace_back(op);
 }
 
@@ -1394,8 +1402,11 @@ void ConvertArgMaxOperator(const NodeDef& node,
                            Model* model) {
   CHECK_EQ(node.op(), "ArgMax");
   CheckInputsCount(node, tf_import_flags, 2);
-  const auto axis_data_type = GetDataTypeAttr(node, "Tidx");
-  const auto output_type = GetDataTypeAttr(node, "output_type");
+  const auto axis_data_type =
+      HasAttr(node, "Tidx") ? GetDataTypeAttr(node, "Tidx") : DT_INT32;
+  const auto output_type = HasAttr(node, "output_type")
+                               ? GetDataTypeAttr(node, "output_type")
+                               : DT_INT64;
   CHECK(axis_data_type == DT_INT64 || axis_data_type == DT_INT32);
   CHECK(output_type == DT_INT64 || output_type == DT_INT32);
   auto* op = new ArgMaxOperator;
@@ -1772,7 +1783,7 @@ void ConvertStackOperator(const NodeDef& node,
     op->inputs.push_back(node.input(i));
   }
   // Both "Stack" and "Pack" have the "axis" attribute.
-  op->axis = GetIntAttr(node, "axis");
+  op->axis = HasAttr(node, "axis") ? GetIntAttr(node, "axis") : 0;
   op->outputs.push_back(node.name());
   model->operators.emplace_back(op);
 }

From 771f7b46d631fa510658685d1b84ffbb22ffcd55 Mon Sep 17 00:00:00 2001
From: Nupur Garg 
Date: Mon, 23 Apr 2018 17:10:05 -0700
Subject: [PATCH 0639/1734] Improve TOCO SavedModel support.

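This splits the Python tooling into convert.py (the TOCO invocation),
convert_saved_model.py (SavedModel freezing helpers), lite_constants.py,
and a convert_saved_model_to_frozen_graph binary. A rough sketch of the
main entry point after the split (names as introduced below; the graph is
illustrative only):

    import tensorflow as tf
    from tensorflow.contrib.lite.python import convert

    img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
    out = tf.identity(img, name="out")
    with tf.Session() as sess:
      tflite_model = convert.toco_convert(sess.graph_def, [img], [out])
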
PiperOrigin-RevId: 194009891
---
 tensorflow/contrib/lite/python/BUILD          |  45 +-
 tensorflow/contrib/lite/python/convert.py     | 187 ++++++++
 .../lite/python/convert_saved_model.py        | 415 ++++++++++++------
 .../lite/python/convert_saved_model_test.py   | 172 ++++++--
 .../convert_saved_model_to_frozen_graph.py    | 106 +++++
 .../python/{lite_test.py => convert_test.py}  |  41 +-
 tensorflow/contrib/lite/python/lite.py        | 204 +--------
 .../contrib/lite/python/lite_constants.py     |  53 +++
 8 files changed, 842 insertions(+), 381 deletions(-)
 create mode 100644 tensorflow/contrib/lite/python/convert.py
 create mode 100644 tensorflow/contrib/lite/python/convert_saved_model_to_frozen_graph.py
 rename tensorflow/contrib/lite/python/{lite_test.py => convert_test.py} (82%)
 create mode 100644 tensorflow/contrib/lite/python/lite_constants.py

diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD
index 926896d609d..e6dcc7aa099 100644
--- a/tensorflow/contrib/lite/python/BUILD
+++ b/tensorflow/contrib/lite/python/BUILD
@@ -39,16 +39,35 @@ py_test(
 py_library(
     name = "lite",
     srcs = ["lite.py"],
-    # data = [
-    #     "//tensorflow/contrib/lite/toco/python:toco_from_protos",
-    # ],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        ":convert",
+        ":convert_saved_model",
         ":op_hint",
+    ],
+)
+
+py_library(
+    name = "lite_constants",
+    srcs = ["lite_constants.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/lite/toco:toco_flags_proto_py",
+    ],
+)
+
+py_library(
+    name = "convert",
+    srcs = ["convert.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":lite_constants",
         "//tensorflow/contrib/lite/toco:model_flags_proto_py",
         "//tensorflow/contrib/lite/toco:toco_flags_proto_py",
         "//tensorflow/contrib/lite/toco/python:tensorflow_wrap_toco",
+        "//tensorflow/contrib/lite/toco/python:toco_from_protos",
         "//tensorflow/python:platform",
     ],
 )
@@ -66,15 +85,15 @@ py_library(
 )
 
 py_test(
-    name = "lite_test",
-    srcs = ["lite_test.py"],
+    name = "convert_test",
+    srcs = ["convert_test.py"],
     srcs_version = "PY2AND3",
     tags = [
         "no-internal-py3",
         "no_oss",
     ],
     deps = [
-        ":lite",
+        ":convert",
         ":op_hint",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -84,13 +103,14 @@ py_test(
     ],
 )
 
-py_binary(
+py_library(
     name = "convert_saved_model",
     srcs = ["convert_saved_model.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        ":lite",
+        ":convert",
+        ":lite_constants",
         "//tensorflow/contrib/saved_model:saved_model_py",
         "//tensorflow/python:graph_util",
         "//tensorflow/python/tools:freeze_graph_lib",
@@ -130,6 +150,15 @@ py_test(
     ],
 )
 
+py_binary(
+    name = "convert_saved_model_to_frozen_graph",
+    srcs = ["convert_saved_model_to_frozen_graph.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":convert_saved_model",
+    ],
+)
+
 # Transitive dependencies of this target will be included in the pip package.
 py_library(
     name = "tf_lite_py_pip",
diff --git a/tensorflow/contrib/lite/python/convert.py b/tensorflow/contrib/lite/python/convert.py
new file mode 100644
index 00000000000..c4200c879ba
--- /dev/null
+++ b/tensorflow/contrib/lite/python/convert.py
@@ -0,0 +1,187 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Converts a frozen graph into a TFLite FlatBuffer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os as _os
+import subprocess as _subprocess
+import tempfile as _tempfile
+
+from tensorflow.contrib.lite.python import lite_constants
+from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2
+from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2
+from tensorflow.python.framework import dtypes as _dtypes
+from tensorflow.python.platform import resource_loader as _resource_loader
+from tensorflow.python.util.lazy_loader import LazyLoader
+
+
+# Lazy load since some of the performance benchmark skylark rules
+# break dependencies.
+_toco_python = LazyLoader(
+    "tensorflow_wrap_toco", globals(),
+    "tensorflow.contrib.lite.toco.python."
+    "tensorflow_wrap_toco")
+del LazyLoader
+
+# Find the toco_from_protos binary using the resource loader if running from
+# bazel; otherwise we are in a pip install, where console_scripts already
+# provides the toco_from_protos tool.
+if lite_constants.EXPERIMENTAL_USE_TOCO_API_DIRECTLY:
+  _toco_from_proto_bin = ""
+else:
+  _toco_from_proto_bin = _resource_loader.get_path_to_datafile(
+      "../toco/python/toco_from_protos")
+
+if _toco_from_proto_bin and not _os.path.exists(_toco_from_proto_bin):
+  _toco_from_proto_bin = "toco_from_protos"
+
+
+def toco_convert_protos(model_flags_str, toco_flags_str, input_data_str):
+  """Convert `input_data_str` according to model and toco parameters.
+
+  Unless you know what you are doing, consider using
+  the more friendly @{tf.contrib.lite.toco_convert}.
+
+  Args:
+    model_flags_str: Serialized proto describing model properties, see
+      `toco/model_flags.proto`.
+    toco_flags_str: Serialized proto describing conversion properties, see
+      `toco/toco_flags.proto`.
+    input_data_str: Input data in serialized form (e.g. a GraphDef is common).
+  Returns:
+    Converted model in serialized form (e.g. a TFLITE model is common).
+  Raises:
+    RuntimeError: When conversion fails, an exception is raised with the error
+      message embedded.
+  """
+  # TODO(aselle): When toco does not use fatal errors for failure, we can
+  # switch this on.
+  if not _toco_from_proto_bin:
+    return _toco_python.TocoConvert(
+        model_flags_str, toco_flags_str, input_data_str)
+
+  with _tempfile.NamedTemporaryFile() as fp_toco, \
+           _tempfile.NamedTemporaryFile() as fp_model, \
+           _tempfile.NamedTemporaryFile() as fp_input, \
+           _tempfile.NamedTemporaryFile() as fp_output:
+    fp_model.write(model_flags_str)
+    fp_toco.write(toco_flags_str)
+    fp_input.write(input_data_str)
+    fp_model.flush()
+    fp_toco.flush()
+    fp_input.flush()
+
+    cmd = [
+        _toco_from_proto_bin, fp_model.name, fp_toco.name, fp_input.name,
+        fp_output.name
+    ]
+    cmdline = " ".join(cmd)
+    proc = _subprocess.Popen(
+        cmdline,
+        shell=True,
+        stdout=_subprocess.PIPE,
+        stderr=_subprocess.STDOUT,
+        close_fds=True)
+    stdout, stderr = proc.communicate()
+    exitcode = proc.returncode
+    if exitcode == 0:
+      stuff = fp_output.read()
+      return stuff
+    else:
+      raise RuntimeError("TOCO failed see console for info.\n%s\n%s\n" %
+                         (stdout, stderr))
+
+
+def tensor_name(x):
+  return x.name.split(":")[0]
+
+
+def toco_convert(input_data,
+                 input_tensors,
+                 output_tensors,
+                 inference_type=lite_constants.FLOAT,
+                 input_format=lite_constants.TENSORFLOW_GRAPHDEF,
+                 output_format=lite_constants.TFLITE,
+                 quantized_input_stats=None,
+                 drop_control_dependency=True):
+  """Convert a model using TOCO from `input_format` to `output_format`.
+
+  Typically this is to convert from TensorFlow GraphDef to TFLite, in which
+  case the default `input_format` and `output_format` are sufficient.
+
+  Args:
+    input_data: Input data (i.e. often `sess.graph_def`).
+    input_tensors: List of input tensors. Type and shape are computed using
+      `foo.get_shape()` and `foo.dtype`.
+    output_tensors: List of output tensors (only .name is used from this).
+    inference_type: Currently must be `{FLOAT, QUANTIZED_UINT8}`.
+    input_format: Type of data to read (currently must be TENSORFLOW_GRAPHDEF).
+    output_format: Type of data to write (currently must be TFLITE or
+      GRAPHVIZ_DOT).
+    quantized_input_stats: For each member of input_tensors the mean and
+      std deviation of training data. Only needed if `inference_type` is
+      `QUANTIZED_UINT8`.
+    drop_control_dependency: Drops control dependencies silently. This is due
+      to tf lite not supporting control dependencies.
+
+  Returns:
+    The converted data. For example, if TFLite is the destination format,
+    this will be a TFLite flatbuffer in a bytes array.
+
+  Raises:
+    ValueError: If the input tensor type is unknown
+    RuntimeError: If TOCO fails to convert (in which case the runtime error's
+      error text will contain the TOCO error log)
+  """
+  toco = _toco_flags_pb2.TocoFlags()
+  toco.input_format = input_format
+  toco.output_format = output_format
+  toco.drop_control_dependency = drop_control_dependency
+  model = _model_flags_pb2.ModelFlags()
+  toco.inference_type = inference_type
+  for idx, input_tensor in enumerate(input_tensors):
+    if input_tensor.dtype == _dtypes.float32:
+      tflite_input_type = lite_constants.FLOAT
+    elif input_tensor.dtype == _dtypes.int32:
+      tflite_input_type = lite_constants.INT32
+    elif input_tensor.dtype == _dtypes.int64:
+      tflite_input_type = lite_constants.INT64
+    # TODO(aselle): Insert strings when they are available
+    else:
+      raise ValueError("Tensors %s not known type %r" % (input_tensor.name,
+                                                         input_tensor.dtype))
+
+    input_array = model.input_arrays.add()
+
+    if inference_type == lite_constants.QUANTIZED_UINT8:
+      if tflite_input_type == lite_constants.FLOAT:
+        tflite_input_type = lite_constants.QUANTIZED_UINT8
+      input_array.mean_value, input_array.std_value = quantized_input_stats[idx]
+
+    input_array.name = tensor_name(input_tensor)
+    input_array.shape.dims.extend(map(int, input_tensor.get_shape()))
+
+  for output_tensor in output_tensors:
+    model.output_arrays.append(tensor_name(output_tensor))
+
+  # TODO(aselle): Consider handling the case of allowing quantized
+  # inputs to be converted to float (via the toco.inference_input_type field).
+  data = toco_convert_protos(model.SerializeToString(),
+                             toco.SerializeToString(),
+                             input_data.SerializeToString())
+  return data
diff --git a/tensorflow/contrib/lite/python/convert_saved_model.py b/tensorflow/contrib/lite/python/convert_saved_model.py
index a2b5ef488ec..a7eddf3408f 100644
--- a/tensorflow/contrib/lite/python/convert_saved_model.py
+++ b/tensorflow/contrib/lite/python/convert_saved_model.py
@@ -12,52 +12,43 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-r"""TensorFlow Lite flatbuffer generation from saved_models.
+"""Functions to convert SavedModel to frozen GraphDefs."""
 
-Example:
-
-bazel run third_party/tensorflow/contrib/lite/python:convert_saved_model -- \
-  --saved_model_dir=/tmp/test_saved_model/1519865537 \
-  --output_tflite=/tmp/test.lite
-
-"""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.lite.python import lite
+from tensorflow.contrib.lite.python import convert
+from tensorflow.contrib.lite.python import lite_constants
+from tensorflow.contrib.lite.toco import model_flags_pb2
 from tensorflow.contrib.saved_model.python.saved_model import reader
 from tensorflow.contrib.saved_model.python.saved_model import signature_def_utils
 from tensorflow.core.framework import types_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import graph_util as tf_graph_util
 from tensorflow.python.framework import ops
-from tensorflow.python.platform import app
-from tensorflow.python.platform import flags
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import loader
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import tag_constants
 
-flags.DEFINE_string("saved_model_dir", "", "Saved model directory to convert.")
-flags.DEFINE_string("output_tflite", None, "File path to write flatbuffer.")
-flags.DEFINE_string("output_arrays", None,
-                    "List of output tensor names, the default value is None, "
-                    "which means the conversion will keep all outputs.")
-flags.DEFINE_integer("batch_size", 1,
-                     "If input tensor shape has None at first dimension, "
-                     "e.g. (None,224,224,3), replace None with batch_size.")
-flags.DEFINE_string("tag_set", tag_constants.SERVING,
-                    "Group of tag(s) of the MetaGraphDef in the saved_model, "
-                    "in string format, separated by ','. For tag-set contains "
-                    "multiple tags, all tags must be passed in.")
-flags.DEFINE_string("signature_key",
-                    signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY,
-                    "This is signature key to extract inputs, outputs.")
+
+def _write_and_flush_file(file_path, data_str):
+  """Writes data to file path.
+
+  Args:
+    file_path: Full path of the file to store data in.
+    data_str: Data represented as a string.
+
+  Returns: None.
+  """
+  with gfile.Open(file_path, "wb") as data_file:
+    data_file.write(data_str)
+    data_file.flush()
 
 
-def log_tensor_details(tensor_info):
+def _log_tensor_details(tensor_info):
   """Log tensor details: name, shape, and type."""
   for key in tensor_info:
     val = tensor_info[key]
@@ -73,7 +64,7 @@ def log_tensor_details(tensor_info):
                  dtype)
 
 
-def get_meta_graph_def(saved_model_dir, tag_set):
+def _get_meta_graph_def(saved_model_dir, tag_set):
   """Validate saved_model and extract MetaGraphDef.
 
   Args:
@@ -103,7 +94,7 @@ def get_meta_graph_def(saved_model_dir, tag_set):
                      "values are '{}'. ".format(tag_set, tag_sets))
 
 
-def get_signature_def(meta_graph, signature_key):
+def _get_signature_def(meta_graph, signature_key):
   """Get the signature def from meta_graph with given signature_key.
 
   Args:
@@ -130,11 +121,11 @@ def get_signature_def(meta_graph, signature_key):
   return signature_def
 
 
-def get_inputs_outputs(signature_def):
-  """Get inputs and outputs from signature def.
+def _get_inputs_outputs(signature_def):
+  """Get inputs and outputs from SignatureDef.
 
   Args:
-    signature_def: signatuer def in the meta_graph_def for conversion.
+    signature_def: SignatureDef in the meta_graph_def for conversion.
 
   Returns:
     The inputs and outputs in the graph for conversion.
@@ -142,9 +133,9 @@ def get_inputs_outputs(signature_def):
   inputs_tensor_info = signature_def.inputs
   outputs_tensor_info = signature_def.outputs
   logging.info("input tensors info: ")
-  log_tensor_details(inputs_tensor_info)
+  _log_tensor_details(inputs_tensor_info)
   logging.info("output tensors info: ")
-  log_tensor_details(outputs_tensor_info)
+  _log_tensor_details(outputs_tensor_info)
 
   def gather_names(tensor_info):
     return [tensor_info[key].name for key in tensor_info]
@@ -154,109 +145,277 @@ def get_inputs_outputs(signature_def):
   return inputs, outputs
 
 
-def convert(saved_model_dir,
-            output_tflite=None,
-            output_arrays=None,
-            tag_set=None,
-            signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY,
-            batch_size=1):
-  """Convert a saved_model to tflite flatbuffer.
+def _get_tensors(graph, signature_def_tensor_names=None,
+                 user_tensor_names=None):
+  """Gets the tensors associated with the tensor names.
+
+  Either signature_def_tensor_names or user_tensor_names should be provided. If
+  user_tensor_names are given, the tensors associated with those names are
+  returned. Otherwise, the tensors associated with the names in the
+  SignatureDef are returned.
 
   Args:
-    saved_model_dir: Saved model directory to convert.
-    output_tflite: File path to write result flatbuffer.
-    output_arrays: List of output tensor names, the default value is None, which
-      means conversion keeps all output tensors. This is also used to filter
-      tensors that are from Op currently not supported in tflite, e.g., Argmax).
-    tag_set: This is the set of tags to get meta_graph_def in saved_model.
-    signature_key: This is the signature key to extract inputs, outputs.
-    batch_size: If input tensor shape has None at first dimension,
-      e.g. (None,224,224,3), replace None with batch_size.
+    graph: GraphDef representing graph.
+    signature_def_tensor_names: Tensor names stored in either the inputs or
+      outputs of a SignatureDef. (default None)
+    user_tensor_names: Tensor names provided by the user. (default None)
+
+  Returns:
+    List of tensors.
+
+  Raises:
+    ValueError:
+      signature_def_tensor_names and user_tensor_names are undefined or empty.
+      user_tensor_names are not valid.
+  """
+  tensors = []
+  if user_tensor_names:
+    # Get the list of all of the tensors with and without the tensor index.
+    all_tensor_names = [
+        tensor.name for op in graph.get_operations() for tensor in op.outputs
+    ]
+    all_tensor_names_only = [name.split(":")[0] for name in all_tensor_names]
+
+    # Sort the tensor names.
+    user_tensor_names = sorted(user_tensor_names)
+
+    # Get the tensors associated with the tensor names.
+    tensors = []
+    invalid_tensors = []
+    for name in user_tensor_names:
+      if name not in all_tensor_names_only:
+        invalid_tensors.append(name)
+      else:
+        idx = all_tensor_names_only.index(name)
+        tensors.append(graph.get_tensor_by_name(all_tensor_names[idx]))
+
+    # Throw ValueError if any user input names are not valid tensors.
+    if invalid_tensors:
+      raise ValueError("Invalid tensors '{}' were found.".format(
+          ",".join(invalid_tensors)))
+  elif signature_def_tensor_names:
+    tensors = [
+        graph.get_tensor_by_name(name)
+        for name in sorted(signature_def_tensor_names)
+    ]
+  else:
+    # Throw ValueError if signature_def_tensor_names and user_tensor_names are
+    # both either undefined or empty.
+    raise ValueError(
+        "Specify either signature_def_tensor_names or user_tensor_names")
+
+  return tensors
+
+
+def _freeze_saved_model(saved_model_dir, input_arrays, input_shapes,
+                        output_arrays, tag_set, signature_key, batch_size):
+  """Converts a SavedModel to a frozen graph.
+
+  Args:
+    saved_model_dir: SavedModel directory to convert.
+    input_arrays: List of input tensors to freeze graph with. Uses input arrays
+      from SignatureDef when none are provided. (default None)
+    input_shapes: Map of strings representing input tensor names to lists of
+      integers representing input shapes (e.g., {"foo" : [1, 16, 16, 3]}).
+      Automatically determined when the input shape is None (e.g.,
+      {"foo" : None}). (default None)
+    output_arrays: List of output tensors to freeze graph with. Uses output
+      arrays from SignatureDef when none are provided. (default None)
+    tag_set: Set of tags identifying the MetaGraphDef within the SavedModel to
+      analyze. All tags in the tag set must be present. (default "serve")
+    signature_key: Key identifying SignatureDef containing inputs and outputs.
+    batch_size: Batch size for the model. Replaces the first dimension of an
+      input shape when it is None. (default 1)
+
+  Returns:
+    frozen_graph_def: Frozen GraphDef.
+    in_tensors: List of input tensors for the graph.
+    out_tensors: List of output tensors for the graph.
+
+  Raises:
+    ValueError:
+      SavedModel doesn't contain a MetaGraphDef identified by tag_set.
+      signature_key is not in the MetaGraphDef.
+      input_shapes does not match the length of input_arrays.
+      input_shapes has a None value after the 1st dimension.
+      input_arrays or output_arrays are not valid.
+      Unable to load Session.
+  """
+  # Set default values for inputs if they are set to None.
+  if signature_key is None:
+    signature_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+  if tag_set is None:
+    tag_set = set([tag_constants.SERVING])
+  if batch_size is None:
+    batch_size = 1
+
+  # Read SignatureDef.
+  meta_graph = _get_meta_graph_def(saved_model_dir, tag_set)
+  signature_def = _get_signature_def(meta_graph, signature_key)
+  inputs, outputs = _get_inputs_outputs(signature_def)
+
+  graph = ops.Graph()
+  with session.Session(graph=graph) as sess:
+    # TODO(nupurgarg): Throw ValueError if SavedModel has assets/ directory.
+    loader.load(sess, meta_graph.meta_info_def.tags, saved_model_dir)
+
+    # Gets input and output tensors.
+    # TODO(zhixianyan): Use TFLite supported Op list to filter outputs.
+    in_tensors = _get_tensors(graph, inputs, input_arrays)
+    out_tensors = _get_tensors(graph, outputs, output_arrays)
+
+    # Gets a fully defined tensor shape. A None in the first dimension of an
+    # input shape, e.g. (None, 224, 224, 3), is replaced with the batch_size.
+    # Shapes with None after the first dimension result in a ValueError.
+    # TODO(zhixianyan): Add support for input tensors with more than one None.
+    for tensor in in_tensors:
+      if (input_shapes and tensor.name in input_shapes and
+          input_shapes[tensor.name] is not None):
+        shape = input_shapes[tensor.name]
+      else:
+        shape = tensor.get_shape().as_list()
+
+      if None in shape[1:]:
+        raise ValueError(
+            "None is only supported in the 1st dimension. Tensor '{0}' has "
+            "invalid shape '{1}'.".format(tensor.name, shape))
+      elif shape[0] is None:
+        shape[0] = batch_size
+      tensor.set_shape(shape)
+
+    # Use the output tensors selected above so that user-provided
+    # output_arrays are respected when freezing the graph.
+    output_names = [tensor.name.split(":")[0] for tensor in out_tensors]
+    frozen_graph_def = tf_graph_util.convert_variables_to_constants(
+        sess, graph.as_graph_def(), output_names)
+
+    return frozen_graph_def, in_tensors, out_tensors
+  raise ValueError("Unable to load Session.")
+
+
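The shape handling in _freeze_saved_model reduces to one rule: only the first
dimension may be None, and it is filled with batch_size. The helper below is a
hypothetical, pure-Python restatement of that loop, not part of the patch:

    def _fix_shape(shape, batch_size=1):
      # Hypothetical mirror of the shape-fixing loop in _freeze_saved_model.
      if None in shape[1:]:
        raise ValueError(
            "None is only supported in the 1st dimension: %r" % shape)
      if shape[0] is None:
        shape = [batch_size] + shape[1:]
      return shape

    assert _fix_shape([None, 224, 224, 3]) == [1, 224, 224, 3]
    assert _fix_shape([1, 16, 16, 3]) == [1, 16, 16, 3]
    # _fix_shape([None, 16, None, 3]) raises ValueError.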
+def saved_model_to_frozen_graphdef(
+    saved_model_dir,
+    output_file_model,
+    output_file_flags,
+    input_arrays=None,
+    input_shapes=None,
+    output_arrays=None,
+    tag_set=None,
+    signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY,
+    batch_size=1):
+  """Converts a SavedModel to a frozen graph. Writes graph to tmp directory.
+
+  Stores frozen graph and command line flags in the tmp directory.
+
+  Args:
+    saved_model_dir: SavedModel directory to convert.
+    output_file_model: Full file path to save frozen graph.
+    output_file_flags: Full file path to save ModelFlags.
+    input_arrays: List of input tensors to freeze graph with. Uses input arrays
+      from SignatureDef when none are provided. (default None)
+    input_shapes: Map of strings representing input tensor names to lists of
+      integers representing input shapes (e.g., {"foo" : [1, 16, 16, 3]}).
+      Automatically determined when the input shape is None (e.g.,
+      {"foo" : None}). (default None)
+    output_arrays: List of output tensors to freeze graph with. Uses output
+      arrays from SignatureDef when none are provided. (default None)
+    tag_set: Set of tags identifying the MetaGraphDef within the SavedModel to
+      analyze. All tags in the tag set must be present. (default "serve")
+    signature_key: Key identifying SignatureDef containing inputs and outputs.
+    batch_size: Batch size for the model. Replaces the first dimension of an
+      input shape when it is None. (default 1)
+
+  Returns:
+    None.
+
+  Raises:
+    ValueError: Unable to convert to frozen graph.
+  """
+  frozen_graph_def, in_tensors, out_tensors = _freeze_saved_model(
+      saved_model_dir, input_arrays, input_shapes, output_arrays, tag_set,
+      signature_key, batch_size)
+
+  # Initialize model flags.
+  model = model_flags_pb2.ModelFlags()
+
+  for input_tensor in in_tensors:
+    input_array = model.input_arrays.add()
+    input_array.name = convert.tensor_name(input_tensor)
+    input_array.shape.dims.extend(map(int, input_tensor.get_shape()))
+
+  for output_tensor in out_tensors:
+    model.output_arrays.append(convert.tensor_name(output_tensor))
+
+  # Write model and ModelFlags to file. ModelFlags contain input array and
+  # output array information that is parsed from the SignatureDef and used for
+  # analysis by TOCO.
+  _write_and_flush_file(output_file_model, frozen_graph_def.SerializeToString())
+  _write_and_flush_file(output_file_flags, model.SerializeToString())
+
+
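For reference, the ModelFlags file written above can be read back with the
generated proto class, as the new tests later in this patch also do; the path
below is hypothetical:

    from tensorflow.contrib.lite.toco import model_flags_pb2
    from tensorflow.python.platform import gfile

    proto = model_flags_pb2.ModelFlags()
    with gfile.Open("/tmp/model_flags.pb", "rb") as f:  # hypothetical path
      proto.ParseFromString(f.read())
    print([a.name for a in proto.input_arrays], list(proto.output_arrays))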
+def tflite_from_saved_model(
+    saved_model_dir,
+    output_file=None,
+    input_arrays=None,
+    input_shapes=None,
+    output_arrays=None,
+    tag_set=None,
+    signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY,
+    batch_size=1,
+    inference_type=lite_constants.FLOAT,
+    input_format=lite_constants.TENSORFLOW_GRAPHDEF,
+    output_format=lite_constants.TFLITE,
+    quantized_input_stats=None,
+    drop_control_dependency=True):
+  """Converts a SavedModel to TFLite FlatBuffer.
+
+  Args:
+    saved_model_dir: SavedModel directory to convert.
+    output_file: File path to write result TFLite FlatBuffer.
+    input_arrays: List of input tensors to freeze graph with. Uses input arrays
+      from SignatureDef when none are provided. (default None)
+    input_shapes: Map of strings representing input tensor names to lists of
+      integers representing input shapes (e.g., {"foo" : [1, 16, 16, 3]}).
+      Automatically determined when the input shape is None (e.g.,
+      {"foo" : None}). (default None)
+    output_arrays: List of output tensors to freeze graph with. Uses output
+      arrays from SignatureDef when none are provided. (default None)
+    tag_set: Set of tags identifying the MetaGraphDef within the SavedModel to
+      analyze. All tags in the tag set must be present. (default "serve")
+    signature_key: Key identifying SignatureDef containing inputs and outputs.
+    batch_size: Batch size for the model. Replaces the first dimension of an
+      input shape when it is None. (default 1)
+    inference_type: Currently must be `{FLOAT, QUANTIZED_UINT8}`.
+    input_format: Type of data to read (currently must be TENSORFLOW_GRAPHDEF).
+    output_format: Type of data to write (currently must be TFLITE or
+      GRAPHVIZ_DOT).
+    quantized_input_stats: For each input tensor, the mean and standard
+      deviation of the training data. Only needed if `inference_type` is
+      `QUANTIZED_UINT8`.
+    drop_control_dependency: Drops control dependencies silently, since
+      TensorFlow Lite does not support control dependencies.
 
   Returns:
     The converted data. For example if tflite was the destination, then
     this will be a tflite flatbuffer in a bytes array.
 
   Raises:
-    ValueError: If tag_set does not indicate any meta_graph_def in saved_model,
-      or signature_key is not in relevant meta_graph_def,
-      or input shape has None beyond 1st dimension, e.g., (1,None, None, 3),
-      or given output_arrays are not valid causing empty outputs.
+    ValueError: Unable to convert to frozen graph.
   """
-  if tag_set is None:
-    tag_set = set([tag_constants.SERVING])
+  frozen_graph_def, in_tensors, out_tensors = _freeze_saved_model(
+      saved_model_dir, input_arrays, input_shapes, output_arrays, tag_set,
+      signature_key, batch_size)
 
-  meta_graph = get_meta_graph_def(saved_model_dir, tag_set)
-  signature_def = get_signature_def(meta_graph, signature_key)
-  inputs, outputs = get_inputs_outputs(signature_def)
+  result = convert.toco_convert(
+      input_data=frozen_graph_def,
+      input_tensors=in_tensors,
+      output_tensors=out_tensors,
+      inference_type=inference_type,
+      input_format=input_format,
+      output_format=output_format,
+      quantized_input_stats=quantized_input_stats,
+      drop_control_dependency=drop_control_dependency)
 
-  graph = ops.Graph()
-  with session.Session(graph=graph) as sess:
+  if output_file is not None:
+    with gfile.Open(output_file, "wb") as f:
+      f.write(result)
+    logging.info("Successfully converted to: %s", output_file)
 
-    loader.load(sess, meta_graph.meta_info_def.tags, saved_model_dir)
-
-    in_tensors = [graph.get_tensor_by_name(input_) for input_ in inputs]
-
-    # Users can use output_arrays to filter output tensors for conversion.
-    # If output_arrays is None, we keep all output tensors. In future, we may
-    # use tflite supported Op list and check whether op is custom Op to
-    # automatically filter output arrays.
-    # TODO(zhixianyan): Use tflite supported Op list to filter outputs.
-    if output_arrays is not None:
-      output_arrays = output_arrays.split(",")
-      out_tensors = [
-          graph.get_tensor_by_name(output)
-          for output in outputs
-          if output.split(":")[0] in output_arrays
-      ]
-    else:
-      out_tensors = [graph.get_tensor_by_name(output) for output in outputs]
-
-    output_names = [node.split(":")[0] for node in outputs]
-
-    if not out_tensors:
-      raise ValueError(
-          "No valid output tensors for '{}', possible values are '{}'".format(
-              output_arrays, output_names))
-
-    frozen_graph_def = tf_graph_util.convert_variables_to_constants(
-        sess, graph.as_graph_def(), output_names)
-
-    # Toco requires fully defined tensor shape, for input tensor with None in
-    # their shape, e.g., (None, 224, 224, 3), we need to replace first None with
-    # a given batch size. For shape with more None, e.g. (None, None, None, 3),
-    # still be able to replace and convert, but require further investigation.
-    # TODO(zhixianyan): Add supports for input tensor with more None in shape.
-    for i in range(len(in_tensors)):
-      shape = in_tensors[i].get_shape().as_list()
-      if shape[0] is None:
-        shape[0] = batch_size
-      if None in shape[1:]:
-        raise ValueError(
-            "Only support None shape at 1st dim as batch_size. But tensor "
-            "'{}' 's shape '{}' has None at other dimension. ".format(
-                inputs[i], shape))
-      in_tensors[i].set_shape(shape)
-
-    result = lite.toco_convert(frozen_graph_def, in_tensors, out_tensors)
-
-    if output_tflite is not None:
-      with gfile.Open(output_tflite, "wb") as f:
-        f.write(result)
-      logging.info("Successfully converted to: %s", output_tflite)
-
-    return result
-
-
-def main(_):
-  convert(
-      saved_model_dir=flags.FLAGS.saved_model_dir,
-      output_tflite=flags.FLAGS.output_tflite,
-      output_arrays=flags.FLAGS.output_arrays,
-      batch_size=flags.FLAGS.batch_size,
-      tag_set=set(flags.FLAGS.tag_set.split(",")),
-      signature_key=flags.FLAGS.signature_key)
-
-
-if __name__ == "__main__":
-  app.run(main)
+  return result
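Taken together, the module now exposes two entry points. A usage sketch, with
illustrative "/tmp" paths:

    from tensorflow.contrib.lite.python import convert_saved_model

    # SavedModel -> TFLite FlatBuffer (bytes), optionally written to a file.
    tflite_model = convert_saved_model.tflite_from_saved_model(
        saved_model_dir="/tmp/simple_savedmodel",
        output_file="/tmp/model.lite")

    # SavedModel -> frozen GraphDef plus ModelFlags, for analysis by TOCO.
    convert_saved_model.saved_model_to_frozen_graphdef(
        saved_model_dir="/tmp/simple_savedmodel",
        output_file_model="/tmp/model.pb",
        output_file_flags="/tmp/model_flags.pb")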
diff --git a/tensorflow/contrib/lite/python/convert_saved_model_test.py b/tensorflow/contrib/lite/python/convert_saved_model_test.py
index 734e42d619b..db95fc8ad7f 100644
--- a/tensorflow/contrib/lite/python/convert_saved_model_test.py
+++ b/tensorflow/contrib/lite/python/convert_saved_model_test.py
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TF Lite SavedModel Conversion test cases.
-
- - test on generated saved_models from simple graphs (sanity check)
- - test mnist savedmodel generated on-the-fly
+"""TFLite SavedModel conversion test cases.
 
+  - Tests converting simple SavedModel graph to TFLite FlatBuffer.
+  - Tests converting simple SavedModel graph to frozen graph.
+  - Tests converting MNIST SavedModel to TFLite FlatBuffer.
 """
 
 from __future__ import absolute_import
@@ -25,6 +25,7 @@ from __future__ import print_function
 
 import os
 from tensorflow.contrib.lite.python import convert_saved_model
+from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2
 from tensorflow.python import keras
 from tensorflow.python.client import session
 from tensorflow.python.estimator import estimator_lib as estimator
@@ -37,6 +38,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import saved_model
 from tensorflow.python.training import training as train
@@ -45,7 +47,7 @@ from tensorflow.python.training import training as train
 class ConvertSavedModelTestBasicGraph(test_util.TensorFlowTestCase):
 
   def _createSimpleSavedModel(self, shape):
-    """Create a simple savedmodel on the fly."""
+    """Create a simple SavedModel on the fly."""
     saved_model_dir = os.path.join(self.get_temp_dir(), "simple_savedmodel")
     with session.Session() as sess:
       in_tensor = array_ops.placeholder(shape=shape, dtype=dtypes.float32)
@@ -56,44 +58,78 @@ class ConvertSavedModelTestBasicGraph(test_util.TensorFlowTestCase):
     return saved_model_dir
 
   def testSimpleSavedModel(self):
-    """Test a simple savedmodel created on the fly."""
-    # Create a simple savedmodel
+    """Test a simple SavedModel created on the fly."""
+    # Create a simple SavedModel
     saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
     # Convert to tflite
-    result = convert_saved_model.convert(saved_model_dir=saved_model_dir)
+    result = convert_saved_model.tflite_from_saved_model(
+        saved_model_dir=saved_model_dir)
     self.assertTrue(result)
 
   def testSimpleSavedModelWithNoneBatchSizeInShape(self):
-    """Test a simple savedmodel, with None in input tensor's shape."""
+    """Test a simple SavedModel, with None in input tensor's shape."""
     saved_model_dir = self._createSimpleSavedModel(shape=[None, 16, 16, 3])
-    result = convert_saved_model.convert(saved_model_dir=saved_model_dir)
+    result = convert_saved_model.tflite_from_saved_model(
+        saved_model_dir=saved_model_dir)
     self.assertTrue(result)
 
   def testSimpleSavedModelWithMoreNoneInShape(self):
-    """Test a simple savedmodel, fail as more None in input shape."""
+    """Test a simple SavedModel, fail as more None in input shape."""
     saved_model_dir = self._createSimpleSavedModel(shape=[None, 16, None, 3])
     # Convert to tflite: this should raise ValueError, as 3rd dim is None.
     with self.assertRaises(ValueError):
-      convert_saved_model.convert(saved_model_dir=saved_model_dir)
+      convert_saved_model.tflite_from_saved_model(
+          saved_model_dir=saved_model_dir)
 
   def testSimpleSavedModelWithWrongSignatureKey(self):
-    """Test a simple savedmodel, fail as given signature is invalid."""
+    """Test a simple SavedModel, fail as given signature is invalid."""
     saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
     # Convert to tflite: this should raise ValueError, as
     # signature_key does not exist in the saved_model.
     with self.assertRaises(ValueError):
-      convert_saved_model.convert(
+      convert_saved_model.tflite_from_saved_model(
           saved_model_dir=saved_model_dir, signature_key="wrong-key")
 
   def testSimpleSavedModelWithWrongOutputArray(self):
-    """Test a simple savedmodel, fail as given output_arrays is invalid."""
-    # Create a simple savedmodel
+    """Test a simple SavedModel, fail as given output_arrays is invalid."""
+    # Create a simple SavedModel
     saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
     # Convert to tflite: this should raise ValueError, as
     # output_arrays is not valid for the saved_model.
     with self.assertRaises(ValueError):
-      convert_saved_model.convert(
-          saved_model_dir=saved_model_dir, output_arrays="wrong-output")
+      convert_saved_model.tflite_from_saved_model(
+          saved_model_dir=saved_model_dir, output_arrays=["wrong-output"])
+
+  def testSimpleSavedModelWithWrongInputArrays(self):
+    """Test a simple SavedModel, fail as given input_arrays is invalid."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+    # Checks invalid input_arrays.
+    with self.assertRaises(ValueError):
+      convert_saved_model.tflite_from_saved_model(
+          saved_model_dir=saved_model_dir, input_arrays=["wrong-input"])
+    # Checks valid and invalid input_arrays.
+    with self.assertRaises(ValueError):
+      convert_saved_model.tflite_from_saved_model(
+          saved_model_dir=saved_model_dir,
+          input_arrays=["Placeholder", "wrong-input"])
+
+  def testSimpleSavedModelWithCorrectArrays(self):
+    """Test a simple SavedModel, with correct input_arrays and output_arrays."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[None, 16, 16, 3])
+    result = convert_saved_model.tflite_from_saved_model(
+        saved_model_dir=saved_model_dir,
+        input_arrays=["Placeholder"],
+        output_arrays=["add"])
+    self.assertTrue(result)
+
+  def testSimpleSavedModelWithCorrectInputArrays(self):
+    """Test a simple SavedModel, with correct input_arrays and input_shapes."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+    result = convert_saved_model.tflite_from_saved_model(
+        saved_model_dir=saved_model_dir,
+        input_arrays=["Placeholder"],
+        input_shapes={"Placeholder": [1, 16, 16, 3]})
+    self.assertTrue(result)
 
   def testMultipleMetaGraphDef(self):
     """Test saved model with multiple MetaGraphDef."""
@@ -119,20 +155,103 @@ class ConvertSavedModelTestBasicGraph(test_util.TensorFlowTestCase):
           sess,
           tags=[saved_model.tag_constants.SERVING, "additional_test_tag"],
           signature_def_map=signature_def_map)
+
       # MetaGraphDef 2
       builder.add_meta_graph(tags=["tflite"])
       builder.save(True)
 
     # Convert to tflite
-    convert_saved_model.convert(
+    convert_saved_model.tflite_from_saved_model(
         saved_model_dir=saved_model_dir,
         tag_set=set([saved_model.tag_constants.SERVING, "additional_test_tag"]))
 
 
+class ConvertSavedModelTestBasicGraphToText(test_util.TensorFlowTestCase):
+
+  def _createSimpleSavedModel(self, shape):
+    """Create a simple SavedModel."""
+    saved_model_dir = os.path.join(self.get_temp_dir(), "simple_savedmodel")
+    with session.Session() as sess:
+      in_tensor_1 = array_ops.placeholder(
+          shape=shape, dtype=dtypes.float32, name="inputB")
+      in_tensor_2 = array_ops.placeholder(
+          shape=shape, dtype=dtypes.float32, name="inputA")
+      out_tensor = in_tensor_1 + in_tensor_2
+      inputs = {"x": in_tensor_1, "y": in_tensor_2}
+      outputs = {"z": out_tensor}
+      saved_model.simple_save(sess, saved_model_dir, inputs, outputs)
+    return saved_model_dir
+
+  def _getInputArrayNames(self, model_proto):
+    return [data.name for data in model_proto.input_arrays]
+
+  def _getInputArrayShapes(self, model_proto):
+    return [
+        [dim for dim in data.shape.dims] for data in model_proto.input_arrays
+    ]
+
+  def _get_model_flags_proto_from_file(self, filename):
+    proto = _model_flags_pb2.ModelFlags()
+    with gfile.Open(filename, "rb") as output_file:
+      proto.ParseFromString(output_file.read())
+    return proto
+
+  def testSimpleSavedModel(self):
+    """Test a simple SavedModel."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+    output_file_model = os.path.join(self.get_temp_dir(), "model.pb")
+    output_file_flags = os.path.join(self.get_temp_dir(), "model.pbtxt")
+
+    convert_saved_model.saved_model_to_frozen_graphdef(
+        saved_model_dir=saved_model_dir,
+        output_file_model=output_file_model,
+        output_file_flags=output_file_flags,
+        input_arrays=["inputB", "inputA"])
+
+    proto = self._get_model_flags_proto_from_file(output_file_flags)
+    self.assertEqual(proto.output_arrays, ["add"])
+    self.assertEqual(self._getInputArrayNames(proto), ["inputA", "inputB"])
+    self.assertEqual(
+        self._getInputArrayShapes(proto), [[1, 16, 16, 3], [1, 16, 16, 3]])
+
+  def testSimpleSavedModelWithDifferentInputNames(self):
+    """Test a simple SavedModel."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+    output_file_model = os.path.join(self.get_temp_dir(), "model.pb")
+    output_file_flags = os.path.join(self.get_temp_dir(), "model.pbtxt")
+
+    # Check case where input shape is given.
+    convert_saved_model.saved_model_to_frozen_graphdef(
+        saved_model_dir=saved_model_dir,
+        output_file_model=output_file_model,
+        output_file_flags=output_file_flags,
+        input_arrays=["inputA"],
+        input_shapes={"inputA": [1, 16, 16, 3]})
+
+    proto = self._get_model_flags_proto_from_file(output_file_flags)
+    self.assertEqual(proto.output_arrays, ["add"])
+    self.assertEqual(self._getInputArrayNames(proto), ["inputA"])
+    self.assertEqual(self._getInputArrayShapes(proto), [[1, 16, 16, 3]])
+
+    # Check case where input shape is None.
+    convert_saved_model.saved_model_to_frozen_graphdef(
+        saved_model_dir=saved_model_dir,
+        output_file_model=output_file_model,
+        output_file_flags=output_file_flags,
+        input_arrays=["inputA"],
+        input_shapes={"inputA": None})
+
+    proto = self._get_model_flags_proto_from_file(output_file_flags)
+    self.assertEqual(proto.output_arrays, ["add"])
+    self.assertEqual(self._getInputArrayNames(proto), ["inputA"])
+    self.assertEqual(self._getInputArrayShapes(proto), [[1, 16, 16, 3]])
+
+
 class Model(keras.Model):
   """Model to recognize digits in the MNIST dataset.
 
-  Train and export savedmodel, used for testOnflyTrainMnistSavedModel
+  Train and export SavedModel, used for testTrainedMnistSavedModel.
 
   Network structure is equivalent to:
   https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/examples/tutorials/mnist/mnist_deep.py
@@ -238,7 +357,7 @@ def dummy_input_fn():
 class ConvertSavedModelTestTrainGraph(test_util.TensorFlowTestCase):
 
   def testTrainedMnistSavedModel(self):
-    """Test mnist savedmodel, trained with dummy data and small steps."""
+    """Test mnist SavedModel, trained with dummy data and small steps."""
     # Build classifier
     classifier = estimator.Estimator(
         model_fn=model_fn,
@@ -253,21 +372,20 @@ class ConvertSavedModelTestTrainGraph(test_util.TensorFlowTestCase):
         "image": image,
     })
 
-    # Export savedmodel
+    # Export SavedModel
     saved_model_dir = os.path.join(self.get_temp_dir(), "mnist_savedmodel")
     classifier.export_savedmodel(saved_model_dir, pred_input_fn)
 
     # Convert to tflite and test output
     saved_model_name = os.listdir(saved_model_dir)[0]
     saved_model_final_dir = os.path.join(saved_model_dir, saved_model_name)
-    output_tflite = os.path.join(saved_model_dir,
-                                 saved_model_final_dir + ".lite")
+    output_file = os.path.join(saved_model_dir, saved_model_final_dir + ".lite")
     # TODO(zhixianyan): no need to limit output_arrays to `Softmax'
     # once b/74205001 fixed and argmax implemented in tflite.
-    result = convert_saved_model.convert(
+    result = convert_saved_model.tflite_from_saved_model(
         saved_model_dir=saved_model_final_dir,
-        output_arrays="Softmax",
-        output_tflite=output_tflite)
+        output_arrays=["Softmax"],
+        output_file=output_file)
 
     self.assertTrue(result)
 
diff --git a/tensorflow/contrib/lite/python/convert_saved_model_to_frozen_graph.py b/tensorflow/contrib/lite/python/convert_saved_model_to_frozen_graph.py
new file mode 100644
index 00000000000..4d9782f4a6a
--- /dev/null
+++ b/tensorflow/contrib/lite/python/convert_saved_model_to_frozen_graph.py
@@ -0,0 +1,106 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python console command for generating frozen models from SavedModels.
+
+This exists to add SavedModel compatibility to TOCO.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import sys
+from tensorflow.contrib.lite.python.convert_saved_model import saved_model_to_frozen_graphdef
+from tensorflow.python.platform import app
+
+FLAGS = None
+
+
+def execute(unused_args):
+  """Calls function to convert the SavedModel to a frozen graph."""
+  # Error handling.
+  if FLAGS.input_shapes and not FLAGS.input_arrays:
+    raise ValueError("Input shapes requires input arrays to be specified.")
+
+  # Parse the flags and call saved_model_to_frozen_graphdef.
+  input_arrays = (FLAGS.input_arrays.split(",") if FLAGS.input_arrays else None)
+  input_shapes = None
+  if FLAGS.input_shapes:
+    # Convert shape dims to ints so that tensor.set_shape() accepts them.
+    input_shapes = {
+        input_arrays[idx]: [int(dim) for dim in shape.split(",")]
+        for idx, shape in enumerate(FLAGS.input_shapes.split(":"))
+    }
+  output_arrays = (
+      FLAGS.output_arrays.split(",") if FLAGS.output_arrays else None)
+  tag_set = set(FLAGS.tag_set.split(",")) if FLAGS.tag_set else None
+
+  saved_model_to_frozen_graphdef(
+      saved_model_dir=FLAGS.saved_model_directory,
+      output_file_model=FLAGS.output_file_model,
+      output_file_flags=FLAGS.output_file_flags,
+      input_arrays=input_arrays,
+      input_shapes=input_shapes,
+      output_arrays=output_arrays,
+      tag_set=tag_set,
+      signature_key=FLAGS.signature_key,
+      batch_size=FLAGS.batch_size)
+
+
+def main():
+  global FLAGS
+  # Parses flags.
+  parser = argparse.ArgumentParser(
+      description="Invoke SavedModel to frozen model converter.")
+  parser.add_argument(
+      "saved_model_directory",
+      type=str,
+      help="Full path to directory containing the SavedModel.")
+  parser.add_argument(
+      "output_file_model",
+      type=str,
+      help="Full file path to save frozen graph.")
+  parser.add_argument(
+      "output_file_flags", type=str, help="Full file path to save ModelFlags.")
+  parser.add_argument(
+      "--input_arrays",
+      type=str,
+      help="Name of the input arrays, comma-separated.")
+  parser.add_argument(
+      "--input_shapes",
+      type=str,
+      help="Shapes corresponding to --input_arrays, colon-separated.")
+  parser.add_argument(
+      "--output_arrays",
+      type=str,
+      help="Name of the output arrays, comma-separated.")
+  parser.add_argument(
+      "--tag_set", type=str, help="Name of output arrays, comma-separated.")
+  parser.add_argument(
+      "--signature_key",
+      type=str,
+      help="Key identifying SignatureDef containing inputs and outputs.")
+  parser.add_argument(
+      "--batch_size",
+      type=int,
+      help="Batch size for the model. Replaces the first dimension of an "
+      "input size array if undefined.")
+
+  FLAGS, unparsed = parser.parse_known_args()
+
+  app.run(main=execute, argv=[sys.argv[0]] + unparsed)
+
+
+if __name__ == "__main__":
+  main()
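The flag grammar above (arrays comma-separated, shapes colon-separated, one
shape per input array) can be checked in isolation; the values below are
illustrative:

    input_arrays = "inputA,inputB".split(",")
    input_shapes = {
        input_arrays[idx]: [int(dim) for dim in shape.split(",")]
        for idx, shape in enumerate("1,16,16,3:1,16,16,3".split(":"))
    }
    assert input_shapes == {"inputA": [1, 16, 16, 3],
                            "inputB": [1, 16, 16, 3]}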
diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/convert_test.py
similarity index 82%
rename from tensorflow/contrib/lite/python/lite_test.py
rename to tensorflow/contrib/lite/python/convert_test.py
index b8b4510188b..dc21a9b6693 100644
--- a/tensorflow/contrib/lite/python/lite_test.py
+++ b/tensorflow/contrib/lite/python/convert_test.py
@@ -17,8 +17,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.lite.python import lite
-from tensorflow.contrib.lite.python.op_hint import _tensor_name_base as _tensor_name_base
+from tensorflow.contrib.lite.python import convert
+from tensorflow.contrib.lite.python import lite_constants
+from tensorflow.contrib.lite.python import op_hint
 from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
@@ -29,7 +30,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
-class LiteTest(test_util.TensorFlowTestCase):
+class ConvertTest(test_util.TensorFlowTestCase):
 
   def testBasic(self):
     in_tensor = array_ops.placeholder(shape=[1, 16, 16, 3],
@@ -37,13 +38,13 @@ class LiteTest(test_util.TensorFlowTestCase):
     out_tensor = in_tensor + in_tensor
     sess = session.Session()
     # Try running on valid graph
-    result = lite.toco_convert(sess.graph_def, [in_tensor], [out_tensor])
+    result = convert.toco_convert(sess.graph_def, [in_tensor], [out_tensor])
     self.assertTrue(result)
     # TODO(aselle): remove tests that fail (we must get TOCO to not fatal
     # all the time).
     # Try running on identity graph (known fail)
     # with self.assertRaisesRegexp(RuntimeError, "!model->operators.empty()"):
-    #   result = lite.toco_convert(sess.graph_def, [in_tensor], [in_tensor])
+    #   result = convert.toco_convert(sess.graph_def, [in_tensor], [in_tensor])
 
   def testQuantization(self):
     in_tensor = array_ops.placeholder(shape=[1, 16, 16, 3],
@@ -51,13 +52,14 @@ class LiteTest(test_util.TensorFlowTestCase):
     out_tensor = array_ops.fake_quant_with_min_max_args(in_tensor + in_tensor,
                                                         min=0., max=1.)
     sess = session.Session()
-    result = lite.toco_convert(sess.graph_def, [in_tensor], [out_tensor],
-                               inference_type=lite.QUANTIZED_UINT8,
-                               quantized_input_stats=[(0., 1.)])
+    result = convert.toco_convert(
+        sess.graph_def, [in_tensor], [out_tensor],
+        inference_type=lite_constants.QUANTIZED_UINT8,
+        quantized_input_stats=[(0., 1.)])
     self.assertTrue(result)
 
 
-class LiteTestOpHint(test_util.TensorFlowTestCase):
+class ConvertTestOpHint(test_util.TensorFlowTestCase):
   """Test the hint to stub functionality."""
 
   def _getGraphOpTypes(self, graphdef, output_nodes):
@@ -99,7 +101,7 @@ class LiteTestOpHint(test_util.TensorFlowTestCase):
     swish_scale = array_ops.constant(1.0)
 
     def _swish(input_tensor, scale):
-      custom = lite.OpHint("cool_activation")
+      custom = op_hint.OpHint("cool_activation")
       input_tensor, scale = custom.add_inputs(input_tensor, scale)
       output = math_ops.sigmoid(input_tensor) * input_tensor * scale
       output, = custom.add_outputs(output)
@@ -111,11 +113,12 @@ class LiteTestOpHint(test_util.TensorFlowTestCase):
       # and 1 final output).
       self.assertEqual(self._countIdentities(sess.graph_def.node), 4)
 
-      stubbed_graphdef = lite.convert_op_hints_to_stubs(sess)
+      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(sess)
 
       self.assertCountEqual(
           self._getGraphOpTypes(
-              stubbed_graphdef, output_nodes=[_tensor_name_base(output)]),
+              stubbed_graphdef,
+              output_nodes=[op_hint._tensor_name_base(output)]),
           ["cool_activation", "Const", "Identity"])
 
   def testScaleAndBiasAndIdentity(self):
@@ -125,7 +128,7 @@ class LiteTestOpHint(test_util.TensorFlowTestCase):
     b = array_ops.constant([4., 5.])
 
     def _scaled_and_bias_and_identity(a, x, b):
-      custom = lite.OpHint("scale_and_bias_and_identity")
+      custom = op_hint.OpHint("scale_and_bias_and_identity")
       a, x, b = custom.add_inputs(a, x, b)
       return custom.add_outputs(a * x + b, x)
     output = array_ops.identity(_scaled_and_bias_and_identity(a, x, b),
@@ -136,11 +139,12 @@ class LiteTestOpHint(test_util.TensorFlowTestCase):
       # +1 for the final output
       self.assertEqual(self._countIdentities(sess.graph_def.node), 6)
 
-      stubbed_graphdef = lite.convert_op_hints_to_stubs(sess)
+      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(sess)
 
       self.assertCountEqual(
           self._getGraphOpTypes(
-              stubbed_graphdef, output_nodes=[_tensor_name_base(output)]),
+              stubbed_graphdef,
+              output_nodes=[op_hint._tensor_name_base(output)]),
           ["scale_and_bias_and_identity", "Const", "Identity", "Pack"])
 
   def testTwoFunctions(self):
@@ -148,7 +152,7 @@ class LiteTestOpHint(test_util.TensorFlowTestCase):
     a = array_ops.constant([1.])
     b = array_ops.constant([1.])
     def _double_values(x):
-      custom = lite.OpHint("add_test")
+      custom = op_hint.OpHint("add_test")
       x = custom.add_inputs(x)
       output = math_ops.multiply(x, x)
       output, = custom.add_outputs(output)
@@ -160,10 +164,11 @@ class LiteTestOpHint(test_util.TensorFlowTestCase):
       # make sure one identity for each input (2) and output (2) => 2 + 2
       # +1 for the final output
       self.assertEqual(self._countIdentities(sess.graph_def.node), 5)
-      stubbed_graphdef = lite.convert_op_hints_to_stubs(sess)
+      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(sess)
       self.assertCountEqual(
           self._getGraphOpTypes(
-              stubbed_graphdef, output_nodes=[_tensor_name_base(output)]),
+              stubbed_graphdef,
+              output_nodes=[op_hint._tensor_name_base(output)]),
           ["add_test", "Const", "Identity", "Add"])
 
 
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index cf50f9d4d65..4ea40201f73 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -18,6 +18,7 @@ EXPERIMENTAL: APIs here are unstable and likely to change without notice.
 
 @@toco_convert
 @@toco_convert_protos
+@@tflite_from_saved_model
 @@OpHint
 @@convert_op_hints_to_stubs
 
@@ -25,208 +26,11 @@ EXPERIMENTAL: APIs here are unstable and likely to change without notice.
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-import os as _os
-import subprocess as _subprocess
-import tempfile as _tempfile
 
 # pylint: disable=unused-import
+from tensorflow.contrib.lite.python.convert import toco_convert
+from tensorflow.contrib.lite.python.convert import toco_convert_protos
+from tensorflow.contrib.lite.python.convert_saved_model import tflite_from_saved_model
 from tensorflow.contrib.lite.python.op_hint import convert_op_hints_to_stubs
 from tensorflow.contrib.lite.python.op_hint import OpHint
 # pylint: enable=unused-import
-from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2
-from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2
-from tensorflow.contrib.lite.toco import types_pb2 as _types_pb2
-from tensorflow.python.framework import dtypes as _dtypes
-from tensorflow.python.platform import resource_loader as _resource_loader
-from tensorflow.python.util.all_util import remove_undocumented
-from tensorflow.python.util.lazy_loader import LazyLoader
-
-# Lazy load since some of the performance benchmark skylark rules
-# break dependencies.
-_toco_python = LazyLoader(
-    "tensorflow_wrap_toco", globals(),
-    "tensorflow.contrib.lite.toco.python."
-    "tensorflow_wrap_toco")
-del LazyLoader
-
-# Enum types from the protobuf promoted to the API
-FLOAT = _types_pb2.FLOAT
-INT32 = _types_pb2.INT32
-INT64 = _types_pb2.INT64
-STRING = _types_pb2.STRING
-QUANTIZED_UINT8 = _types_pb2.QUANTIZED_UINT8
-TENSORFLOW_GRAPHDEF = _toco_flags_pb2.TENSORFLOW_GRAPHDEF
-TFLITE = _toco_flags_pb2.TFLITE
-GRAPHVIZ_DOT = _toco_flags_pb2.GRAPHVIZ_DOT
-
-# Currently the default mode of operation is to shell to another python process
-# to protect against crashes. However, it breaks some dependent targets because
-# it forces us to depend on an external py_binary. The experimental API doesn't
-# have that drawback.
-EXPERIMENTAL_USE_TOCO_API_DIRECTLY = False
-
-# Find the toco_from_protos binary using the resource loader if using from
-# bazel, otherwise we are in a pip where console_scripts already has
-# the toco_from_protos tool.
-if EXPERIMENTAL_USE_TOCO_API_DIRECTLY:
-  _toco_from_proto_bin = ""
-else:
-  _toco_from_proto_bin = _resource_loader.get_path_to_datafile(
-      "../toco/python/toco_from_protos")
-
-if _toco_from_proto_bin and not _os.path.exists(_toco_from_proto_bin):
-  _toco_from_proto_bin = "toco_from_protos"
-
-
-def toco_convert_protos(model_flags_str, toco_flags_str, input_data_str):
-  """Convert `input_data_str` according to model and toco parameters.
-
-  Unless you know what you are doing consider using
-  the more friendly @{tf.contrib.lite.toco_convert}}.
-
-  Args:
-    model_flags_str: Serialized proto describing model properties, see
-      `toco/model_flags.proto`.
-    toco_flags_str: Serialized proto describing conversion properties, see
-      `toco/toco_flags.proto`.
-    input_data_str: Input data in serialized form (e.g. a graphdef is common)
-  Returns:
-    Converted model in serialized form (e.g. a TFLITE model is common).
-  Raises:
-    RuntimeError: When conversion fails, an exception is raised with the error
-      message embedded.
-  """
-  # TODO(aselle): When toco does not use fatal errors for failure, we can
-  # switch this on.
-  if not _toco_from_proto_bin:
-    return _toco_python.TocoConvert(
-        model_flags_str, toco_flags_str, input_data_str)
-
-  with _tempfile.NamedTemporaryFile() as fp_toco, \
-           _tempfile.NamedTemporaryFile() as fp_model, \
-           _tempfile.NamedTemporaryFile() as fp_input, \
-           _tempfile.NamedTemporaryFile() as fp_output:
-    fp_model.write(model_flags_str)
-    fp_toco.write(toco_flags_str)
-    fp_input.write(input_data_str)
-    fp_model.flush()
-    fp_toco.flush()
-    fp_input.flush()
-
-    cmd = [
-        _toco_from_proto_bin, fp_model.name, fp_toco.name, fp_input.name,
-        fp_output.name
-    ]
-    cmdline = " ".join(cmd)
-    proc = _subprocess.Popen(
-        cmdline,
-        shell=True,
-        stdout=_subprocess.PIPE,
-        stderr=_subprocess.STDOUT,
-        close_fds=True)
-    stdout, stderr = proc.communicate()
-    exitcode = proc.returncode
-    if exitcode == 0:
-      stuff = fp_output.read()
-      return stuff
-    else:
-      raise RuntimeError("TOCO failed see console for info.\n%s\n%s\n" %
-                         (stdout, stderr))
-
-
-def _tensor_name(x):
-  return x.name.split(":")[0]
-
-
-def toco_convert(input_data,
-                 input_tensors,
-                 output_tensors,
-                 inference_type=FLOAT,
-                 input_format=TENSORFLOW_GRAPHDEF,
-                 output_format=TFLITE,
-                 quantized_input_stats=None,
-                 drop_control_dependency=True,
-                 allow_custom_ops=None):
-  """Convert a model using TOCO from `input_format` to `output_format`.
-
-  Typically this is to convert from TensorFlow GraphDef to TFLite, in which
-  case the default `input_format` and `output_format` are sufficient.
-
-  Args:
-    input_data: Input data (i.e. often `sess.graph_def`).
-    input_tensors: List of input tensors. Type and shape are computed using
-      `foo.get_shape()` and `foo.dtype`.
-    output_tensors: List of output tensors (only .name is used from this).
-    inference_type: Currently must be `{FLOAT, QUANTIZED_UINT8}`.
-    input_format: Type of data to read (currently must be TENSORFLOW_GRAPHDEF).
-    output_format: Type of data to write (currently must be TFLITE or
-      GRAPHVIZ_DOT)
-    quantized_input_stats: For each member of input_tensors the mean and
-      std deviation of training data. Only needed if `inference_type` is
-      `QUANTIZED_UINT8`.
-    drop_control_dependency: Drops control dependencies silently. This is due
-      to tf lite not supporting control dependencies.
-
-  Returns:
-    The converted data. For example if tflite was the destination, then
-    this will be a tflite flatbuffer in a bytes array.
-
-  Raises:
-    ValueError: If the input tensor type is unknown
-    RuntimeError: If TOCO fails to convert (in which case the runtime error's
-      error text will contain the TOCO error log)
-  """
-  toco = _toco_flags_pb2.TocoFlags()
-  toco.input_format = input_format
-  toco.output_format = output_format
-  toco.inference_type = inference_type
-  toco.drop_control_dependency = drop_control_dependency
-  if allow_custom_ops is not None:
-    toco.allow_custom_ops = allow_custom_ops
-
-  model = _model_flags_pb2.ModelFlags()
-  for idx, input_tensor in enumerate(input_tensors):
-    if input_tensor.dtype == _dtypes.float32:
-      tflite_input_type = FLOAT
-    elif input_tensor.dtype == _dtypes.int32:
-      tflite_input_type = INT32
-    elif input_tensor.dtype == _dtypes.int64:
-      tflite_input_type = INT64
-    # TODO(aselle): Insert strings when they are available
-    else:
-      raise ValueError("Tensors %s not known type %r" % (input_tensor.name,
-                                                         input_tensor.dtype))
-
-    input_array = model.input_arrays.add()
-
-    if inference_type == QUANTIZED_UINT8:
-      if tflite_input_type == FLOAT:
-        tflite_input_type = QUANTIZED_UINT8
-      input_array.mean_value, input_array.std_value = quantized_input_stats[idx]
-
-    input_array.name = _tensor_name(input_tensor)
-    input_array.shape.dims.extend(map(int, input_tensor.get_shape()))
-
-  for output_tensor in output_tensors:
-    model.output_arrays.append(_tensor_name(output_tensor))
-
-  # TODO(aselle): Consider handling the case of allowing quantized
-  # inputs to be converted to float (via the toco.inference_input_type field).
-  data = toco_convert_protos(model.SerializeToString(),
-                             toco.SerializeToString(),
-                             input_data.SerializeToString())
-  return data
-
-
-_allowed_symbols = [
-    "FLOAT",
-    "INT32",
-    "INT64",
-    "STRING",
-    "QUANTIZED_UINT8",
-    "TENSORFLOW_GRAPHDEF",
-    "TFLITE",
-    "GRAPHVIZ_DOT",
-    "EXPERIMENTAL_USE_TOCO_API_DIRECTLY",
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/lite/python/lite_constants.py b/tensorflow/contrib/lite/python/lite_constants.py
new file mode 100644
index 00000000000..195d7a732f3
--- /dev/null
+++ b/tensorflow/contrib/lite/python/lite_constants.py
@@ -0,0 +1,53 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Constants for TFLite."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2
+from tensorflow.contrib.lite.toco import types_pb2 as _types_pb2
+from tensorflow.python.util.all_util import remove_undocumented
+
+# Enum types from the protobuf promoted to the API
+FLOAT = _types_pb2.FLOAT
+INT32 = _types_pb2.INT32
+INT64 = _types_pb2.INT64
+STRING = _types_pb2.STRING
+QUANTIZED_UINT8 = _types_pb2.QUANTIZED_UINT8
+TENSORFLOW_GRAPHDEF = _toco_flags_pb2.TENSORFLOW_GRAPHDEF
+TFLITE = _toco_flags_pb2.TFLITE
+GRAPHVIZ_DOT = _toco_flags_pb2.GRAPHVIZ_DOT
+
+# Currently the default mode of operation is to shell to another python process
+# to protect against crashes. However, it breaks some dependent targets because
+# it forces us to depend on an external py_binary. The experimental API doesn't
+# have that drawback.
+EXPERIMENTAL_USE_TOCO_API_DIRECTLY = False
+
+
+_allowed_symbols = [
+    "FLOAT",
+    "INT32",
+    "INT64",
+    "STRING",
+    "QUANTIZED_UINT8",
+    "TENSORFLOW_GRAPHDEF",
+    "TFLITE",
+    "GRAPHVIZ_DOT",
+    "EXPERIMENTAL_USE_TOCO_API_DIRECTLY",
+]
+remove_undocumented(__name__, _allowed_symbols)
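After this split, constants come from lite_constants and conversion from
convert, while lite.py only re-exports the public names. A minimal sketch
mirroring the updated tests (running it requires the TOCO binary to be
available):

    from tensorflow.contrib.lite.python import convert
    from tensorflow.contrib.lite.python import lite_constants
    from tensorflow.python.client import session
    from tensorflow.python.framework import dtypes
    from tensorflow.python.ops import array_ops

    with session.Session() as sess:
      in_tensor = array_ops.placeholder(shape=[1, 16, 16, 3],
                                        dtype=dtypes.float32)
      out_tensor = in_tensor + in_tensor
      tflite_model = convert.toco_convert(
          sess.graph_def, [in_tensor], [out_tensor],
          output_format=lite_constants.TFLITE)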

From ecd837fd0ab69cf54d920eae3b1c73602be6c626 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Mon, 23 Apr 2018 17:14:16 -0700
Subject: [PATCH 0640/1734] [TF:XLA] Add a kernel for PlaceholderWithDefault

PiperOrigin-RevId: 194010395
---
 tensorflow/compiler/tests/BUILD               | 12 +++++
 tensorflow/compiler/tests/placeholder_test.py | 48 +++++++++++++++++++
 .../compiler/tf2xla/kernels/identity_op.cc    |  1 +
 3 files changed, 61 insertions(+)
 create mode 100644 tensorflow/compiler/tests/placeholder_test.py

diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index ac2441cea0f..0c720932568 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -923,3 +923,15 @@ tf_xla_py_test(
         "//tensorflow/python:platform_test",
     ],
 )
+
+tf_xla_py_test(
+    name = "placeholder_test",
+    size = "small",
+    srcs = ["placeholder_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
diff --git a/tensorflow/compiler/tests/placeholder_test.py b/tensorflow/compiler/tests/placeholder_test.py
new file mode 100644
index 00000000000..5e6d1313bd0
--- /dev/null
+++ b/tensorflow/compiler/tests/placeholder_test.py
@@ -0,0 +1,48 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for xla handling of placeholder_with_default."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+
+
+class PlaceholderTest(XLATestCase):
+
+  def test_placeholder_with_default_default(self):
+    with self.test_session() as sess, self.test_scope():
+      v = resource_variable_ops.ResourceVariable(4.0)
+      ph = array_ops.placeholder_with_default(v, shape=[])
+      out = ph * 2
+      sess.run(variables.variables_initializer([v]))
+      self.assertEqual(8.0, sess.run(out))
+
+  def test_placeholder_with_default_fed(self):
+    with self.test_session() as sess, self.test_scope():
+      v = resource_variable_ops.ResourceVariable(4.0)
+      ph = array_ops.placeholder_with_default(v, shape=[])
+      out = ph * 2
+      sess.run(variables.variables_initializer([v]))
+      self.assertEqual(2.0, sess.run(out, {ph: 1.0}))
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/compiler/tf2xla/kernels/identity_op.cc b/tensorflow/compiler/tf2xla/kernels/identity_op.cc
index 39af662b638..e72200bfbcf 100644
--- a/tensorflow/compiler/tf2xla/kernels/identity_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/identity_op.cc
@@ -38,6 +38,7 @@ class IdentityOp : public XlaOpKernel {
 REGISTER_XLA_OP(Name("Identity").CompilationOnly(), IdentityOp);
 
 REGISTER_XLA_OP(Name("IdentityN").CompilationOnly(), IdentityOp);
+REGISTER_XLA_OP(Name("PlaceholderWithDefault"), IdentityOp);
 REGISTER_XLA_OP(Name("PreventGradient"), IdentityOp);
 REGISTER_XLA_OP(Name("StopGradient"), IdentityOp);
 REGISTER_XLA_OP(Name("Snapshot"), IdentityOp);

From 80fc661853f9a0844faf95eb68438dc85a5879e3 Mon Sep 17 00:00:00 2001
From: Justin Lebar 
Date: Mon, 23 Apr 2018 17:16:55 -0700
Subject: [PATCH 0641/1734] Use tensorflow::se instead of perftools::gputools
 for StreamExecutor.

PiperOrigin-RevId: 194010749
---
 tensorflow/compiler/aot/compile.cc            |  5 +-
 .../compiler/jit/kernels/xla_launch_op.cc     | 12 ++--
 .../compiler/jit/kernels/xla_launch_op.h      |  2 +-
 .../compiler/jit/xla_compile_on_demand_op.cc  |  2 +-
 tensorflow/compiler/jit/xla_device.cc         |  2 -
 tensorflow/compiler/jit/xla_device.h          | 13 ++--
 tensorflow/compiler/jit/xla_device_context.cc |  2 -
 tensorflow/compiler/jit/xla_device_context.h  | 15 ++---
 tensorflow/compiler/jit/xla_launch_util.cc    | 26 ++++----
 tensorflow/compiler/jit/xla_launch_util.h     | 13 ++--
 tensorflow/compiler/jit/xla_tensor.cc         |  9 ++-
 tensorflow/compiler/jit/xla_tensor.h          |  3 +-
 .../fused_conv2d_bias_activation_op.cc        |  2 +-
 .../kernels/adjust_hsv_in_yiq_op_gpu.cu.cc    |  2 +-
 .../mpi_collectives/kernels/mpi_ops.cc        |  2 +-
 tensorflow/contrib/mpi_collectives/mpi_ops.cc |  2 +-
 .../contrib/nccl/kernels/nccl_manager.cc      | 56 ++++++++---------
 .../contrib/nccl/kernels/nccl_manager.h       | 38 +++++-------
 .../contrib/nccl/kernels/nccl_manager_test.cc |  8 +--
 tensorflow/contrib/rnn/kernels/blas_gemm.cc   | 11 ++--
 .../contrib/tensorrt/kernels/trt_engine_op.cc |  1 -
 .../common_runtime/gpu/gpu_bfc_allocator.h    |  8 +--
 .../gpu/gpu_cudamalloc_allocator.h            |  2 +-
 .../common_runtime/gpu/gpu_debug_allocator.cc |  6 +-
 .../common_runtime/gpu/gpu_debug_allocator.h  |  4 +-
 .../core/common_runtime/gpu/gpu_device.cc     |  5 +-
 .../core/common_runtime/gpu/gpu_event_mgr.cc  | 22 +++----
 .../core/common_runtime/gpu/gpu_event_mgr.h   | 30 ++++-----
 .../common_runtime/gpu/gpu_event_mgr_test.cc  | 19 +++---
 .../core/common_runtime/gpu/gpu_init.cc       |  8 +--
 .../core/common_runtime/gpu/gpu_util.cc       | 20 +++---
 tensorflow/core/common_runtime/gpu/gpu_util.h |  5 +-
 .../core/common_runtime/gpu/pool_allocator.h  |  4 +-
 .../common_runtime/gpu/pool_allocator_test.cc | 32 +++++-----
 .../core/common_runtime/gpu_device_context.h  |  4 +-
 tensorflow/core/grappler/devices.cc           | 12 ++--
 tensorflow/core/kernels/avgpooling_op.cc      | 24 +++----
 .../core/kernels/batch_matmul_op_impl.h       | 44 ++++++-------
 tensorflow/core/kernels/bias_op.cc            |  4 +-
 tensorflow/core/kernels/check_numerics_op.cc  |  6 +-
 .../core/kernels/conv_grad_filter_ops.cc      | 32 +++++-----
 .../core/kernels/conv_grad_input_ops.cc       | 28 ++++-----
 tensorflow/core/kernels/conv_grad_ops_3d.cc   | 62 +++++++++----------
 tensorflow/core/kernels/conv_ops.cc           | 24 +++----
 tensorflow/core/kernels/conv_ops_3d.cc        | 26 ++++----
 tensorflow/core/kernels/conv_ops_gpu.h        | 26 ++++----
 tensorflow/core/kernels/crop_and_resize_op.cc |  8 +--
 tensorflow/core/kernels/cuda_device_array.h   |  2 +-
 tensorflow/core/kernels/cuda_solvers.cc       |  6 +-
 tensorflow/core/kernels/cuda_solvers.h        |  2 +-
 tensorflow/core/kernels/cudnn_pooling_gpu.cc  | 42 ++++++-------
 tensorflow/core/kernels/cudnn_pooling_gpu.h   |  4 +-
 tensorflow/core/kernels/cudnn_rnn_ops.cc      | 52 ++++++++--------
 .../core/kernels/depthwise_conv_op_gpu.cu.cc  |  3 +-
 .../kernels/dynamic_partition_op_gpu.cu.cc    |  4 +-
 tensorflow/core/kernels/fft_ops.cc            | 33 +++++-----
 .../core/kernels/fused_batch_norm_op.cc       | 22 +++----
 tensorflow/core/kernels/gpu_utils.h           |  8 +--
 tensorflow/core/kernels/lrn_op.cc             | 12 ++--
 tensorflow/core/kernels/matmul_op.cc          | 51 +++++++--------
 .../kernels/matrix_triangular_solve_op.cc     | 31 +++++-----
 tensorflow/core/kernels/maxpooling_op.cc      | 20 +++---
 tensorflow/core/kernels/pooling_ops_3d.cc     | 23 +++----
 tensorflow/core/kernels/pooling_ops_common.cc | 46 +++++++-------
 .../core/kernels/pooling_ops_common_gpu.h     |  4 +-
 .../core/kernels/segment_reduction_ops.cc     |  4 +-
 tensorflow/core/kernels/where_op.cc           |  5 +-
 .../platform/default/gpu/cupti_wrapper.cc     | 42 ++++++-------
 tensorflow/core/platform/types.h              |  4 +-
 69 files changed, 510 insertions(+), 601 deletions(-)

diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc
index 7c833878818..e17a7c4bf67 100644
--- a/tensorflow/compiler/aot/compile.cc
+++ b/tensorflow/compiler/aot/compile.cc
@@ -88,9 +88,8 @@ Status CompileGraph(const GraphDef& graph_def, const tf2xla::Config& config,
   // Converts the graph into an XLA computation, and compiles the
   // computation.
   // TODO(toddw): Should we let the user pick the XLA cpu vs. gpu client?
-  namespace gpu = perftools::gputools;
-  gpu::Platform* cpu_platform =
-      gpu::MultiPlatformManager::PlatformWithName("Host").ValueOrDie();
+  se::Platform* cpu_platform =
+      se::MultiPlatformManager::PlatformWithName("Host").ValueOrDie();
   xla::CompileOnlyClient* client =
       xla::ClientLibrary::GetOrCreateCompileOnlyClient(cpu_platform)
           .ValueOrDie();
diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
index f48941fce32..03ae09ee8be 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
@@ -37,8 +37,6 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 
-namespace gpu = perftools::gputools;
-
 namespace tensorflow {
 
 XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx)
@@ -51,9 +49,9 @@ XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx)
   num_constant_args_ = constant_types.size();
   OP_REQUIRES_OK(ctx, ctx->GetAttr("Nresources", &num_resource_args_));
   if (device_type_ == DeviceType(DEVICE_CPU)) {
-    platform_id_ = gpu::host::kHostPlatformId;
+    platform_id_ = se::host::kHostPlatformId;
   } else if (device_type_ == DeviceType(DEVICE_GPU)) {
-    platform_id_ = gpu::cuda::kCudaPlatformId;
+    platform_id_ = se::cuda::kCudaPlatformId;
   } else {
     platform_id_ = nullptr;
   }
@@ -69,7 +67,7 @@ Status XlaLocalLaunchOp::BuildCompilationCache(OpKernelContext* ctx,
     return Status::OK();
   }
 
-  auto platform = gpu::MultiPlatformManager::PlatformWithId(platform_id_);
+  auto platform = se::MultiPlatformManager::PlatformWithId(platform_id_);
   if (!platform.ok()) {
     return StreamExecutorUtil::ConvertStatus(platform.status());
   }
@@ -100,7 +98,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   ResourceMgr* rm = ctx->resource_manager();
   OP_REQUIRES(ctx, rm, errors::Internal("No resource manager."));
 
-  gpu::Stream* stream =
+  se::Stream* stream =
       ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr;
 
   XlaCompilationCache* cache;
@@ -153,7 +151,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   options.device_type = &cache->device_type();
   options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
   options.graph_def_version = ctx->function_library()->graph_def_version();
-  options.allow_cpu_custom_calls = (platform_id_ == gpu::host::kHostPlatformId);
+  options.allow_cpu_custom_calls = (platform_id_ == se::host::kHostPlatformId);
   options.device_allocator = xla_allocator;
   // TODO(b/77671268): We don't set variable_representation_shape_fn here. This
   // is restricted to Variables, but we need something like this to apply to
diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.h b/tensorflow/compiler/jit/kernels/xla_launch_op.h
index c6cc0986af0..8f8e646f0ff 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.h
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.h
@@ -53,7 +53,7 @@ class XlaLocalLaunchOp : public OpKernel {
   // Number of resource variable arguments.
   int num_resource_args_;
 
-  perftools::gputools::Platform::Id platform_id_;
+  se::Platform::Id platform_id_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(XlaLocalLaunchOp);
 };
diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
index 6c2782e28e9..60458f6f331 100644
--- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
+++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
@@ -58,7 +58,7 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx,
 
   launch_context.PopulateInputs(ctx, result, variables);
 
-  perftools::gputools::Stream* stream =
+  se::Stream* stream =
       ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr;
   TF_RET_CHECK(stream);
 
diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index 2c2ac839b38..7beb18c04d6 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -51,8 +51,6 @@ limitations under the License.
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 
-namespace se = ::perftools::gputools;
-
 namespace tensorflow {
 
 // Caches a XlaDeviceAllocator per <backend, device ordinal> pair. A
diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h
index 2f5c53aea88..3ae87308cc7 100644
--- a/tensorflow/compiler/jit/xla_device.h
+++ b/tensorflow/compiler/jit/xla_device.h
@@ -49,20 +49,20 @@ class XlaDevice : public LocalDevice {
   // retrieved e.g., when lazily creating the XlaCompilationCache device.
   class Metadata {
    public:
-    Metadata(int device_ordinal, perftools::gputools::Platform* platform,
+    Metadata(int device_ordinal, se::Platform* platform,
              const DeviceType& device_type);
 
     // The index of the device on this host.
     int device_ordinal() const;
 
-    perftools::gputools::Platform* platform() const;
+    se::Platform* platform() const;
     xla::LocalClient* client() const;
     const DeviceType& jit_device_type() const;
 
    private:
     const int device_ordinal_;
     const DeviceType device_type_;
-    perftools::gputools::Platform* platform_;  // Not owned.
+    se::Platform* platform_;  // Not owned.
 
     TF_DISALLOW_COPY_AND_ASSIGN(Metadata);
   };
@@ -85,8 +85,7 @@ class XlaDevice : public LocalDevice {
 
   XlaDevice(const SessionOptions& options, const DeviceAttributes& attrs,
             int device_ordinal, const DeviceType& jit_device_name,
-            ::perftools::gputools::Platform* platform,
-            bool transfer_as_literal);
+            se::Platform* platform, bool transfer_as_literal);
   ~XlaDevice() override;
 
   Allocator* GetAllocator(AllocatorAttributes attr) override;
@@ -103,7 +102,7 @@ class XlaDevice : public LocalDevice {
                              Tensor* tensor) override;
 
   xla::LocalClient* client() const;
-  xla::StatusOr<::perftools::gputools::Stream*> GetStream();
+  xla::StatusOr<se::Stream*> GetStream();
 
   // If not already set, create and set GpuDeviceInfo.
   // Not thread-safe
@@ -118,7 +117,7 @@ class XlaDevice : public LocalDevice {
   DeviceType jit_device_name_;
   // Memory allocator associated with this device.
   Allocator* xla_allocator_;                   // Not owned.
-  ::perftools::gputools::Platform* platform_;  // Not owned.
+  se::Platform* platform_;                     // Not owned.
   // Stream associated with this device. Operations enqueued on this
   // stream are executed on the device. Operations include data
   // copying back and forth between CPU and the device, and
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index 43eb1640126..bf8c1886a02 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -23,8 +23,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/platform/mem.h"
 
-namespace se = ::perftools::gputools;
-
 namespace tensorflow {
 
 // The allocator used for Tensors assigned to the XLA device.
diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h
index ad914a1c23b..d7f5f1d2089 100644
--- a/tensorflow/compiler/jit/xla_device_context.h
+++ b/tensorflow/compiler/jit/xla_device_context.h
@@ -45,8 +45,7 @@ class XlaDeviceAllocator : public Allocator {
 // Helper class for managing data transfers between host and XLA devices.
 class XlaTransferManager {
  public:
-  explicit XlaTransferManager(perftools::gputools::Stream* stream,
-                              xla::LocalClient* client,
+  explicit XlaTransferManager(se::Stream* stream, xla::LocalClient* client,
                               bool transfer_as_literal);
 
   void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
@@ -54,7 +53,7 @@ class XlaTransferManager {
   void CopyDeviceTensorToCPU(const Tensor* device_tensor,
                              StringPiece tensor_name, Device* device,
                              Tensor* cpu_tensor, StatusCallback done);
-  perftools::gputools::Stream* stream() const { return stream_; }
+  se::Stream* stream() const { return stream_; }
 
  private:
   Status TransferLiteralToDevice(const Tensor& host_tensor,
@@ -64,7 +63,7 @@ class XlaTransferManager {
 
   // Stream obtained from a Device, used to transfer tensors between
   // CPU and device.
-  perftools::gputools::Stream* stream_;
+  se::Stream* stream_;
   // For the underlying memory allocator and XLA's TransferManager.
   xla::LocalClient* client_;
   // Transfer manager, for marshalling data to and from the device.
@@ -78,8 +77,8 @@ class XlaTransferManager {
 // wraps the methods in XlaTransferManager.
 class XlaDeviceContext : public DeviceContext {
  public:
-  explicit XlaDeviceContext(perftools::gputools::Stream* stream,
-                            xla::LocalClient* client, bool transfer_as_literal);
+  explicit XlaDeviceContext(se::Stream* stream, xla::LocalClient* client,
+                            bool transfer_as_literal);
 
   void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
                              Tensor* device_tensor,
@@ -87,9 +86,7 @@ class XlaDeviceContext : public DeviceContext {
   void CopyDeviceTensorToCPU(const Tensor* device_tensor,
                              StringPiece tensor_name, Device* device,
                              Tensor* cpu_tensor, StatusCallback done) override;
-  perftools::gputools::Stream* stream() const override {
-    return manager_.stream();
-  }
+  se::Stream* stream() const override { return manager_.stream(); }
 
  private:
   XlaTransferManager manager_;
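
XlaDeviceContext remains a thin forwarding shim over XlaTransferManager after
the renames above. A hedged construction sketch (the `stream` and `client`
values are placeholders for what the XLA device supplies):

    // DeviceContexts are reference-counted in TensorFlow; the XLA device
    // typically creates one per stream and hands it to the executor.
    auto* device_context = new tensorflow::XlaDeviceContext(
        stream, client, /*transfer_as_literal=*/false);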
diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 3520501c1a3..2a7f04271d4 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -32,13 +32,12 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 
+namespace tensorflow {
 namespace {
-namespace gpu = perftools::gputools;
 using xla::ScopedShapedBuffer;
 using xla::ShapedBuffer;
 }  // anonymous namespace
 
-namespace tensorflow {
 std::map<int, OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx,
                                                         int num_variables) {
   std::map<int, OptionalTensor> snapshot;
@@ -57,24 +56,23 @@ std::map<int, OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx,
   return snapshot;
 }
 
-XlaAllocator::XlaAllocator(const gpu::Platform* platform, Allocator* wrapped)
+XlaAllocator::XlaAllocator(const se::Platform* platform, Allocator* wrapped)
     : xla::DeviceMemoryAllocator(platform), wrapped_(wrapped) {}
 
 XlaAllocator::~XlaAllocator() {}
 
-xla::StatusOr<gpu::DeviceMemoryBase> XlaAllocator::Allocate(
+xla::StatusOr<se::DeviceMemoryBase> XlaAllocator::Allocate(
     int device_ordinal, uint64 size, bool retry_on_failure) {
   void* data = wrapped_->AllocateRaw(Allocator::kAllocatorAlignment, size);
   if (data == nullptr) {
     return errors::ResourceExhausted("Out of memory while trying to allocate ",
                                      size, " bytes.");
   } else {
-    return gpu::DeviceMemoryBase(data, size);
+    return se::DeviceMemoryBase(data, size);
   }
 }
 
-Status XlaAllocator::Deallocate(int device_ordinal,
-                                gpu::DeviceMemoryBase* mem) {
+Status XlaAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase* mem) {
   wrapped_->DeallocateRaw(mem->opaque());
   return Status::OK();
 }
@@ -102,7 +100,7 @@ ScopedShapedBuffer ExtractSubShapedBuffer(
                                  /*target_base_index=*/{});
   for (auto& index_to_buffer : shape_tree) {
     if (!index_to_buffer.first.empty() && index_to_buffer.first[0] == index) {
-      index_to_buffer.second = gpu::DeviceMemoryBase(nullptr, 0);
+      index_to_buffer.second = se::DeviceMemoryBase(nullptr, 0);
     }
   }
   return ScopedShapedBuffer(std::move(sub_shaped_buffer), allocator);
@@ -149,7 +147,7 @@ void XlaComputationLaunchContext::PopulateInputs(
           << xla::ShapeUtil::HumanStringWithLayout(on_device_shape)
           << " not the same as on-host shape "
           << xla::ShapeUtil::HumanStringWithLayout(shape);
-      gpu::DeviceMemoryBase dmem = XlaTensor::DeviceMemoryFromTensor(*t);
+      se::DeviceMemoryBase dmem = XlaTensor::DeviceMemoryFromTensor(*t);
       arg_buffers_[i] = xla::MakeUnique<ShapedBuffer>(
           /*on_host_shape=*/shape, /*on_device_shape=*/shape,
           client_->platform(), client_->default_device_ordinal());
@@ -162,7 +160,7 @@ void XlaComputationLaunchContext::PopulateInputs(
 void XlaComputationLaunchContext::PopulateOutputs(
     OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel,
     ScopedShapedBuffer output) {
-  gpu::Stream* stream =
+  se::Stream* stream =
       ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr;
 
   // Computation output should always be a tuple.
@@ -227,7 +225,7 @@ void XlaComputationLaunchContext::PopulateOutputs(
       const TensorShape& shape = kernel->outputs[i].shape;
       VLOG(2) << "Retval " << i << " shape " << shape.DebugString();
 
-      gpu::DeviceMemoryBase buffer = output.buffer({output_num});
+      se::DeviceMemoryBase buffer = output.buffer({output_num});
       if (allocate_xla_tensors_) {
         Tensor* output_tensor;
         OP_REQUIRES_OK(ctx, ctx->allocate_output(i, shape, &output_tensor));
@@ -238,7 +236,7 @@ void XlaComputationLaunchContext::PopulateOutputs(
       } else {
         Tensor output_tensor = XlaTensorBuffer::MakeTensor(
             ctx->expected_output_dtype(i), shape, buffer, allocator);
-        output.set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num});
+        output.set_buffer(se::DeviceMemoryBase(nullptr, 0), {output_num});
         ctx->set_output(i, output_tensor);
       }
       ++output_num;
@@ -258,7 +256,7 @@ void XlaComputationLaunchContext::PopulateOutputs(
                 write.input_index >= 0 && write.input_index < ctx->num_inputs(),
                 errors::Internal("Invalid input index for variable write."));
 
-    gpu::DeviceMemoryBase buffer = output.buffer({output_num});
+    se::DeviceMemoryBase buffer = output.buffer({output_num});
 
     Var* variable = nullptr;
     // TODO(b/35625933): tensorflow::Var should contain a PersistentTensor,
@@ -288,7 +286,7 @@ void XlaComputationLaunchContext::PopulateOutputs(
     } else {
       Tensor output_tensor = XlaTensorBuffer::MakeTensor(
           write.type, write.shape, buffer, allocator);
-      output.set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num});
+      output.set_buffer(se::DeviceMemoryBase(nullptr, 0), {output_num});
       *variable->tensor() = output_tensor;
     }
     ++output_num;
diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h
index 26dcaa8a51d..8a6ff3b0c75 100644
--- a/tensorflow/compiler/jit/xla_launch_util.h
+++ b/tensorflow/compiler/jit/xla_launch_util.h
@@ -46,13 +46,11 @@ std::map<int, OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx,
 // see comment on `AllowsAsynchronousDeallocation()`.
 class XlaAllocator : public xla::DeviceMemoryAllocator {
  public:
-  XlaAllocator(const perftools::gputools::Platform* platform,
-               Allocator* wrapped);
+  XlaAllocator(const se::Platform* platform, Allocator* wrapped);
   ~XlaAllocator() override;
-  xla::StatusOr<perftools::gputools::DeviceMemoryBase> Allocate(
-      int device_ordinal, uint64 size, bool retry_on_failure) override;
-  Status Deallocate(int device_ordinal,
-                    perftools::gputools::DeviceMemoryBase* mem) override;
+  xla::StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal, uint64 size,
+                                               bool retry_on_failure) override;
+  Status Deallocate(int device_ordinal, se::DeviceMemoryBase* mem) override;
 
   // The Tensorflow BFC allocator used on GPU allows host-side deallocation
   // before GPU execution takes place. Tensorflow uses the ordering of the main
@@ -126,8 +124,7 @@ class XlaTensorBuffer : public TensorBuffer {
   }
 
   static Tensor MakeTensor(DataType dtype, const TensorShape& shape,
-                           perftools::gputools::DeviceMemoryBase buffer,
-                           Allocator* allocator) {
+                           se::DeviceMemoryBase buffer, Allocator* allocator) {
     size_t expected_size = shape.num_elements() * DataTypeSize(dtype);
     auto* tensor_buffer = new XlaTensorBuffer(buffer.opaque(), expected_size,
                                               buffer.size(), allocator);
diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc
index 84b2835c406..ce6456880bc 100644
--- a/tensorflow/compiler/jit/xla_tensor.cc
+++ b/tensorflow/compiler/jit/xla_tensor.cc
@@ -31,16 +31,15 @@ namespace tensorflow {
   return FromTensor(const_cast<Tensor*>(tensor));
 }
 
-/*static*/ perftools::gputools::DeviceMemoryBase
-XlaTensor::DeviceMemoryFromTensor(const Tensor& tensor) {
+/*static*/ se::DeviceMemoryBase XlaTensor::DeviceMemoryFromTensor(
+    const Tensor& tensor) {
   const XlaTensor* xla_tensor = FromTensor(&tensor);
   if (xla_tensor) {
     CHECK(xla_tensor->has_shaped_buffer());
     return xla_tensor->shaped_buffer().root_buffer();
   } else {
-    return perftools::gputools::DeviceMemoryBase(
-        const_cast<char*>(tensor.tensor_data().data()),
-        tensor.tensor_data().size());
+    return se::DeviceMemoryBase(const_cast<char*>(tensor.tensor_data().data()),
+                                tensor.tensor_data().size());
   }
 }
 
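
The else-branch above views a plain Tensor's flat bytes as non-owning device
memory. A hedged usage sketch (the CHECK is illustrative only):

    // Wrap an existing Tensor's storage without copying; the Tensor must
    // outlive the returned se::DeviceMemoryBase.
    tensorflow::Tensor t(tensorflow::DT_FLOAT, tensorflow::TensorShape({2, 2}));
    se::DeviceMemoryBase mem =
        tensorflow::XlaTensor::DeviceMemoryFromTensor(t);
    CHECK_EQ(mem.size(), t.tensor_data().size());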
diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h
index 2334fd272be..922a9189731 100644
--- a/tensorflow/compiler/jit/xla_tensor.h
+++ b/tensorflow/compiler/jit/xla_tensor.h
@@ -43,8 +43,7 @@ class XlaTensor {
   // which case the returned value is shaped_buffer()->root_buffer(), or a
   // normal Tensor in which case the returned value is
   // {tensor.tensor_data().data(), tensor.tensor_data().size}.
-  static perftools::gputools::DeviceMemoryBase DeviceMemoryFromTensor(
-      const Tensor& tensor);
+  static se::DeviceMemoryBase DeviceMemoryFromTensor(const Tensor& tensor);
 
   // Assign the internal ShapedBuffer to new memory for the given dtype and
   // shape. If a ShapedBuffer exists already (has_shaped_buffer() == true), it
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
index 1e8f011b5d8..2458f7554af 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -247,7 +247,7 @@ class FusedConv2DBiasActivationOp : public OpKernel {
 };
 
 #if GOOGLE_CUDA
-namespace dnn = ::perftools::gputools::dnn;
+namespace dnn = se::dnn;
 
 // A dummy type to group forward convolution autotune results together.
 struct ConvBiasActivationAutoTuneGroup {
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
index b71ff9cd507..1be97ae3d6e 100644
--- a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
@@ -59,7 +59,7 @@ void AdjustHsvInYiqGPU::operator()(OpKernelContext* ctx, int channel_count,
       delta_h, scale_s, scale_v, tranformation_matrix.flat<float>().data(),
       tranformation_matrix.flat<float>().size());
   // Call cuBlas C = A * B directly.
-  auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+  auto no_transpose = se::blas::Transpose::kNoTranspose;
   auto a_ptr =
       AsDeviceMemory(input->flat<float>().data(), input->flat<float>().size());
   auto b_ptr = AsDeviceMemory(tranformation_matrix.flat<float>().data(),
diff --git a/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc b/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc
index 8dca90a1e34..ed22ee667f1 100644
--- a/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc
+++ b/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc
@@ -73,7 +73,7 @@ limitations under the License.
  */
 
 template <typename T>
-using StatusOr = perftools::gputools::port::StatusOr<T>;
+using StatusOr = se::port::StatusOr<T>;
 
 using CPUDevice = Eigen::ThreadPoolDevice;
 using GPUDevice = Eigen::GpuDevice;
diff --git a/tensorflow/contrib/mpi_collectives/mpi_ops.cc b/tensorflow/contrib/mpi_collectives/mpi_ops.cc
index a051ab00046..475297ca921 100644
--- a/tensorflow/contrib/mpi_collectives/mpi_ops.cc
+++ b/tensorflow/contrib/mpi_collectives/mpi_ops.cc
@@ -74,7 +74,7 @@ limitations under the License.
  */
 
 template <typename T>
-using StatusOr = perftools::gputools::port::StatusOr<T>;
+using StatusOr = se::port::StatusOr<T>;
 
 using CPUDevice = Eigen::ThreadPoolDevice;
 using GPUDevice = Eigen::GpuDevice;
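
The alias keeps existing `StatusOr<T>` call sites in these MPI files compiling
unchanged under either namespace spelling. A minimal sketch of the pattern,
with a hypothetical `GetMpiRank` helper (not part of this patch):

    // Hypothetical: return either a value or an error status, never both.
    StatusOr<int> GetMpiRank(bool initialized) {
      if (!initialized) {
        return tensorflow::errors::FailedPrecondition("MPI not initialized");
      }
      return 0;  // rank 0 in this sketch
    }
    // Callers test ok() before unwrapping:
    //   auto rank = GetMpiRank(true);
    //   if (rank.ok()) DoSomething(rank.ValueOrDie());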
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.cc b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
index b9b482a6981..b1cb89391ce 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
@@ -24,7 +24,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-using ::perftools::gputools::cuda::ScopedActivateExecutorContext;
+using se::cuda::ScopedActivateExecutorContext;
 
 // Contains data for a single stream used for nccl communication; this includes
 // a background thread that calls NcclManager::LoopKernelLaunches.
@@ -37,11 +37,11 @@ struct NcclManager::NcclStream {
     cv.notify_all();
   }
 
-  perftools::gputools::StreamExecutor* executor = nullptr;
+  se::StreamExecutor* executor = nullptr;
 
   // The stream on which to run the nccl collective.
   // This is a different stream than the tensorflow compute stream.
-  std::unique_ptr<perftools::gputools::Stream> stream;
+  std::unique_ptr<se::Stream> stream;
 
   // See NcclManager::LoopKernelLaunches for information on these.
   std::unique_ptr<Thread> thread;
@@ -95,9 +95,8 @@ ncclDataType_t ToNcclType(DataType t) {
 // A participant in a Collective.  See <Collective> below.
 struct NcclManager::Participant {
   Participant(const Tensor* in_t, Tensor* out_t, EventMgr* event_mgr,
-              perftools::gputools::Stream* tensor_stream,
-              perftools::gputools::StreamExecutor* executor, int gpu_device_id,
-              NcclManager::DoneCallback done_callback)
+              se::Stream* tensor_stream, se::StreamExecutor* executor,
+              int gpu_device_id, NcclManager::DoneCallback done_callback)
       : in_t(in_t),
         out_t(out_t),
         event_mgr(event_mgr),
@@ -121,11 +120,11 @@ struct NcclManager::Participant {
   EventMgr* const event_mgr;
 
   // Owned by the caller, who must keep it live until <done_callback> is called.
-  perftools::gputools::Stream* const tensor_stream;
+  se::Stream* const tensor_stream;
 
   // Matches the executor in CommunicatorMember::stream. Expected to be live for
   // process lifetime.
-  perftools::gputools::StreamExecutor* const executor = nullptr;
+  se::StreamExecutor* const executor = nullptr;
 
   const int gpu_device_id;
 
@@ -245,7 +244,7 @@ NcclManager::Communicator* NcclManager::GetCommunicator(
     if (nccl_stream == nullptr) {
       nccl_stream = new NcclStream();
       nccl_stream->executor = executor;
-      nccl_stream->stream.reset(new perftools::gputools::Stream(executor));
+      nccl_stream->stream.reset(new se::Stream(executor));
       nccl_stream->stream->Init();
 
       streams.emplace_back(nccl_stream);
@@ -300,10 +299,10 @@ NcclManager::Communicator* NcclManager::GetCommunicator(
 
 void NcclManager::AddToAllReduce(int num_devices, const string& key,
                                  ncclRedOp_t reduction_op,
-                                 perftools::gputools::StreamExecutor* executor,
+                                 se::StreamExecutor* executor,
                                  int gpu_device_id, EventMgr* event_mgr,
-                                 perftools::gputools::Stream* tensor_stream,
-                                 const Tensor* in_t, Tensor* out_t,
+                                 se::Stream* tensor_stream, const Tensor* in_t,
+                                 Tensor* out_t,
                                  const DoneCallback& done_callback) {
   std::unique_ptr<Participant> participant(
       new Participant(in_t, out_t, event_mgr, tensor_stream, executor,
@@ -312,11 +311,12 @@ void NcclManager::AddToAllReduce(int num_devices, const string& key,
                  kAllReduce, reduction_op);
 }
 
-void NcclManager::AddBroadcastSend(
-    int num_devices, const string& key,
-    perftools::gputools::StreamExecutor* executor, int gpu_device_id,
-    EventMgr* event_mgr, perftools::gputools::Stream* tensor_stream,
-    const Tensor* in_t, DoneCallback done_callback) {
+void NcclManager::AddBroadcastSend(int num_devices, const string& key,
+                                   se::StreamExecutor* executor,
+                                   int gpu_device_id, EventMgr* event_mgr,
+                                   se::Stream* tensor_stream,
+                                   const Tensor* in_t,
+                                   DoneCallback done_callback) {
   std::unique_ptr<Participant> participant(
       new Participant(in_t, nullptr /* out_t */, event_mgr, tensor_stream,
                       executor, gpu_device_id, std::move(done_callback)));
@@ -325,11 +325,11 @@ void NcclManager::AddBroadcastSend(
                  kBroadcast, ncclSum /* unused */);
 }
 
-void NcclManager::AddBroadcastRecv(
-    int num_devices, const string& key,
-    perftools::gputools::StreamExecutor* executor, int gpu_device_id,
-    EventMgr* event_mgr, perftools::gputools::Stream* tensor_stream,
-    Tensor* out_t, DoneCallback done_callback) {
+void NcclManager::AddBroadcastRecv(int num_devices, const string& key,
+                                   se::StreamExecutor* executor,
+                                   int gpu_device_id, EventMgr* event_mgr,
+                                   se::Stream* tensor_stream, Tensor* out_t,
+                                   DoneCallback done_callback) {
   std::unique_ptr<Participant> participant(
       new Participant(nullptr /* in_t */, out_t, event_mgr, tensor_stream,
                       executor, gpu_device_id, std::move(done_callback)));
@@ -339,9 +339,8 @@ void NcclManager::AddBroadcastRecv(
 
 void NcclManager::AddReduceSend(int num_devices, const string& key,
                                 ncclRedOp_t reduction_op,
-                                perftools::gputools::StreamExecutor* executor,
-                                int gpu_device_id, EventMgr* event_mgr,
-                                perftools::gputools::Stream* tensor_stream,
+                                se::StreamExecutor* executor, int gpu_device_id,
+                                EventMgr* event_mgr, se::Stream* tensor_stream,
                                 const Tensor* in_t,
                                 DoneCallback done_callback) {
   std::unique_ptr<Participant> participant(
@@ -353,9 +352,8 @@ void NcclManager::AddReduceSend(int num_devices, const string& key,
 
 void NcclManager::AddReduceRecv(int num_devices, const string& key,
                                 ncclRedOp_t reduction_op,
-                                perftools::gputools::StreamExecutor* executor,
-                                int gpu_device_id, EventMgr* event_mgr,
-                                perftools::gputools::Stream* tensor_stream,
+                                se::StreamExecutor* executor, int gpu_device_id,
+                                EventMgr* event_mgr, se::Stream* tensor_stream,
                                 const Tensor* in_t, Tensor* out_t,
                                 DoneCallback done_callback) {
   std::unique_ptr<Participant> participant(
@@ -444,7 +442,7 @@ void NcclManager::RunCollective(const string& key, Collective* collective) {
 }
 
 void NcclManager::LoopKernelLaunches(NcclStream* nccl_stream) {
-  perftools::gputools::Stream* comm_stream = nccl_stream->stream.get();
+  se::Stream* comm_stream = nccl_stream->stream.get();
   ScopedActivateExecutorContext scoped_context(nccl_stream->executor);
   const cudaStream_t* cu_stream = reinterpret_cast<const cudaStream_t*>(
       comm_stream->implementation()->CudaStreamMemberHack());
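
For orientation, a hedged sketch of how a kernel registers with the manager
after this change (all variables are placeholders for what an OpKernelContext
and GPU device provide):

    // Each participating device joins under a shared key; the NCCL kernel
    // launches once all num_devices participants have arrived.
    NcclManager::instance()->AddToAllReduce(
        /*num_devices=*/2, /*key=*/"allreduce/step42", ncclSum,
        executor,       // se::StreamExecutor* for this GPU
        gpu_device_id,  // CUDA ordinal
        event_mgr,      // EventMgr* that owns completion events
        tensor_stream,  // se::Stream* on which in_t was produced
        &in_t, &out_t,
        [](tensorflow::Status s) { TF_CHECK_OK(s); });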
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.h b/tensorflow/contrib/nccl/kernels/nccl_manager.h
index 6ff8cea84eb..57a96c5d334 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.h
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.h
@@ -55,41 +55,34 @@ class NcclManager {
   // is also the stream that will use the produced data; <done_callback> is
   // not called until the next kernel launched on <tensor_stream> would see
   // the data.
   void AddToAllReduce(int num_devices, const string& key,
-                      ncclRedOp_t reduction_op,
-                      perftools::gputools::StreamExecutor* executor,
+                      ncclRedOp_t reduction_op, se::StreamExecutor* executor,
                       int gpu_device_id, EventMgr* event_mgr,
-                      perftools::gputools::Stream* tensor_stream,
-                      const Tensor* in_t, Tensor* out_t,
-                      const DoneCallback& done_callback);
+                      se::Stream* tensor_stream, const Tensor* in_t,
+                      Tensor* out_t, const DoneCallback& done_callback);
 
   // AddBroadcastSend and AddBroadcastRecv combine to send data from one sender
   // to all receivers.
   void AddBroadcastSend(int num_devices, const string& key,
-                        perftools::gputools::StreamExecutor* executor,
-                        int gpu_device_id, EventMgr* event_mgr,
-                        perftools::gputools::Stream* tensor_stream,
+                        se::StreamExecutor* executor, int gpu_device_id,
+                        EventMgr* event_mgr, se::Stream* tensor_stream,
                         const Tensor* in_t, DoneCallback done_callback);
   void AddBroadcastRecv(int num_devices, const string& key,
-                        perftools::gputools::StreamExecutor* executor,
-                        int gpu_device_id, EventMgr* event_mgr,
-                        perftools::gputools::Stream* tensor_stream,
+                        se::StreamExecutor* executor, int gpu_device_id,
+                        EventMgr* event_mgr, se::Stream* tensor_stream,
                         Tensor* out_t, DoneCallback done_callback);
 
   // AddReduceSend and AddReduceRecv combine to send data from all senders
   // to one receiver.
   void AddReduceSend(int num_devices, const string& key,
-                     ncclRedOp_t reduction_op,
-                     perftools::gputools::StreamExecutor* executor,
+                     ncclRedOp_t reduction_op, se::StreamExecutor* executor,
                      int gpu_device_id, EventMgr* event_mgr,
-                     perftools::gputools::Stream* tensor_stream,
-                     const Tensor* in_t, DoneCallback done_callback);
-  void AddReduceRecv(int num_devices, const string& key,
-                     ncclRedOp_t reduction_op,
-                     perftools::gputools::StreamExecutor* executor,
-                     int gpu_device_id, EventMgr* event_mgr,
-                     perftools::gputools::Stream* tensor_stream,
-                     const Tensor* in_t, Tensor* out_t,
+                     se::Stream* tensor_stream, const Tensor* in_t,
                      DoneCallback done_callback);
+  void AddReduceRecv(int num_devices, const string& key,
+                     ncclRedOp_t reduction_op, se::StreamExecutor* executor,
+                     int gpu_device_id, EventMgr* event_mgr,
+                     se::Stream* tensor_stream, const Tensor* in_t,
+                     Tensor* out_t, DoneCallback done_callback);
 
  private:
   enum CollectiveType {
@@ -123,8 +116,7 @@ class NcclManager {
   // Maps a device to the communication streams that make up its collective.
   // This is used to share the stream across different communicators that
   // include the same device.
-  std::map<perftools::gputools::StreamExecutor*,
-           std::vector<std::unique_ptr<NcclStream>>>
+  std::map<se::StreamExecutor*, std::vector<std::unique_ptr<NcclStream>>>
       device_to_comm_streams_ GUARDED_BY(mu_);
 
   std::vector<std::unique_ptr<Communicator>> communicators_;
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc b/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
index 06ca65e33ad..4d8d922cb42 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
@@ -175,11 +175,9 @@ class NcclManagerTest : public ::testing::Test {
                                     nullptr /* step_resource_manager */);
   }
 
-  static perftools::gputools::DeviceMemory<Scalar> AsDeviceMemory(
-      const Scalar* cuda_memory) {
-    perftools::gputools::DeviceMemoryBase wrapped(
-        const_cast<Scalar*>(cuda_memory));
-    perftools::gputools::DeviceMemory<Scalar> typed(wrapped);
+  static se::DeviceMemory<Scalar> AsDeviceMemory(const Scalar* cuda_memory) {
+    se::DeviceMemoryBase wrapped(const_cast<Scalar*>(cuda_memory));
+    se::DeviceMemory<Scalar> typed(wrapped);
     return typed;
   }
 
diff --git a/tensorflow/contrib/rnn/kernels/blas_gemm.cc b/tensorflow/contrib/rnn/kernels/blas_gemm.cc
index 03006dab323..45d22b739b8 100644
--- a/tensorflow/contrib/rnn/kernels/blas_gemm.cc
+++ b/tensorflow/contrib/rnn/kernels/blas_gemm.cc
@@ -26,9 +26,9 @@ namespace tensorflow {
 #if GOOGLE_CUDA
 namespace {
 template <typename T>
-perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
-  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
-  perftools::gputools::DeviceMemory<T> typed(wrapped);
+se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
+  se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
+  se::DeviceMemory<T> typed(wrapped);
   return typed;
 }
 }  // namespace
@@ -41,9 +41,8 @@ void TensorCuBlasGemm::operator()(OpKernelContext* ctx, bool transa,
                                      T alpha, const T* a, int lda, const T* b,
                                      int ldb, T beta, T* c, int ldc) {
 #if GOOGLE_CUDA
-  perftools::gputools::blas::Transpose trans[] = {
-      perftools::gputools::blas::Transpose::kNoTranspose,
-      perftools::gputools::blas::Transpose::kTranspose};
+  se::blas::Transpose trans[] = {se::blas::Transpose::kNoTranspose,
+                                 se::blas::Transpose::kTranspose};
 
   auto a_ptr = AsDeviceMemory(a);
   auto b_ptr = AsDeviceMemory(b);
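
With both operands wrapped, the cuBLAS launch goes through the stream's BLAS
support. A hedged sketch of the call that follows in this function (c_ptr
wraps the output buffer; shapes and scalars are abbreviated):

    bool blas_launch_status =
        ctx->op_device_context()
            ->stream()
            ->ThenBlasGemm(trans[transa], trans[transb], m, n, k, alpha, a_ptr,
                           lda, b_ptr, ldb, beta, &c_ptr, ldc)
            .ok();
    OP_REQUIRES(ctx, blas_launch_status,
                errors::Internal("TensorCuBlasGemm launch failed"));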
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index b32371b642f..53ba7badcae 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -25,7 +25,6 @@ limitations under the License.
 
 namespace tensorflow {
 static ::tensorflow::tensorrt::Logger logger;
-namespace gpu = ::perftools::gputools;
 using IRuntime = nvinfer1::IRuntime;
 using Dims = nvinfer1::Dims;
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
index c2c0b020c74..ad142e9982a 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
@@ -29,8 +29,6 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
-namespace gpu = ::perftools::gputools;
-
 namespace tensorflow {
 
 // A GPU memory allocator that implements a 'best-fit with coalescing'
@@ -52,7 +50,7 @@ class GPUBFCAllocator : public BFCAllocator {
 class GPUMemAllocator : public SubAllocator {
  public:
   // Note: stream_exec cannot be null.
-  explicit GPUMemAllocator(perftools::gputools::StreamExecutor* stream_exec)
+  explicit GPUMemAllocator(se::StreamExecutor* stream_exec)
       : stream_exec_(stream_exec) {
     CHECK(stream_exec_ != nullptr);
   }
@@ -68,13 +66,13 @@ class GPUMemAllocator : public SubAllocator {
 
   void Free(void* ptr, size_t num_bytes) override {
     if (ptr != nullptr) {
-      gpu::DeviceMemoryBase gpu_ptr(ptr);
+      se::DeviceMemoryBase gpu_ptr(ptr);
       stream_exec_->Deallocate(&gpu_ptr);
     }
   }
 
  private:
-  perftools::gputools::StreamExecutor* stream_exec_;  // not owned, non-null
+  se::StreamExecutor* stream_exec_;  // not owned, non-null
 
   TF_DISALLOW_COPY_AND_ASSIGN(GPUMemAllocator);
 };
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
index 208697361d2..5043fac7974 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
@@ -44,7 +44,7 @@ class GPUcudaMallocAllocator : public VisitableAllocator {
  private:
   VisitableAllocator* base_allocator_ = nullptr;  // owned
 
-  perftools::gputools::StreamExecutor* stream_exec_;  // Not owned.
+  se::StreamExecutor* stream_exec_;  // Not owned.
 
   TF_DISALLOW_COPY_AND_ASSIGN(GPUcudaMallocAllocator);
 };
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
index b0ca7e31096..4ff5fab866a 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
@@ -40,8 +40,7 @@ int64* NewMask(int64 word) {
 int64* before_mask = NewMask(0xabababababababab);
 int64* after_mask = NewMask(0xcdcdcdcdcdcdcdcd);
 
-bool CheckMask(perftools::gputools::StreamExecutor* exec, void* ptr,
-               int64* mask) {
+bool CheckMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
   gpu::DeviceMemory<int64> gpu_ptr{gpu::DeviceMemoryBase{ptr, MASK_BYTES}};
   int64 tmp[MASK_WORDS];
 
@@ -62,8 +61,7 @@ bool CheckMask(perftools::gputools::StreamExecutor* exec, void* ptr,
   return ok;
 }
 
-void InitMask(perftools::gputools::StreamExecutor* exec, void* ptr,
-              int64* mask) {
+void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
   gpu::DeviceMemory<int64> gpu_ptr{gpu::DeviceMemoryBase{ptr, MASK_BYTES}};
   if (!exec->SynchronousMemcpy(&gpu_ptr, mask, MASK_BYTES)) {
     LOG(FATAL) << "Could not copy debug mask";
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
index adce3a84368..c49ec2a5662 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
@@ -55,7 +55,7 @@ class GPUDebugAllocator : public VisitableAllocator {
  private:
   VisitableAllocator* base_allocator_ = nullptr;  // owned
 
-  perftools::gputools::StreamExecutor* stream_exec_;  // Not owned.
+  se::StreamExecutor* stream_exec_;  // Not owned.
 
   TF_DISALLOW_COPY_AND_ASSIGN(GPUDebugAllocator);
 };
@@ -81,7 +81,7 @@ class GPUNanResetAllocator : public VisitableAllocator {
  private:
   VisitableAllocator* base_allocator_ = nullptr;  // owned
 
-  perftools::gputools::StreamExecutor* stream_exec_;  // Not owned.
+  se::StreamExecutor* stream_exec_;  // Not owned.
 
   TF_DISALLOW_COPY_AND_ASSIGN(GPUNanResetAllocator);
 };
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 0b9e8f9cc2d..f7248ca79db 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -297,9 +297,8 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
     }
     scratch_.push_back(static_cast(scratch_buffer));
 
-    perftools::gputools::DeviceMemory<char> mem(
-        perftools::gputools::DeviceMemoryBase(scratch_buffer,
-                                              scratch_buffer_size));
+    se::DeviceMemory<char> mem(
+        se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size));
 
     bool ok = executor_->SynchronousMemZero(
         &mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
index af6a59a85df..48984484760 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
@@ -18,11 +18,9 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
-namespace gpu = ::perftools::gputools;
-
 namespace tensorflow {
 
-EventMgr::EventMgr(gpu::StreamExecutor* se, const GPUOptions& gpu_options)
+EventMgr::EventMgr(se::StreamExecutor* se, const GPUOptions& gpu_options)
     : exec_(se),
       deferred_bytes_threshold_(gpu_options.deferred_deletion_bytes()
                                     ? gpu_options.deferred_deletion_bytes()
@@ -94,7 +92,7 @@ void EventMgr::StopPollingLoop() {
   }
 }
 
-void EventMgr::ThenDeleteTensors(perftools::gputools::Stream* stream,
+void EventMgr::ThenDeleteTensors(se::Stream* stream,
                                  const TensorReferenceVector& tensors) {
   mutex_lock l(mu_);
   // TODO(jeff): We currently keep one accumulated_tensors_ object.
@@ -152,16 +150,16 @@ void EventMgr::PollLoop() {
   polling_stopped_->Notify();
 }
 
-void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu) {
+void EventMgr::QueueInUse(se::Stream* stream, InUse iu) {
   VLOG(2) << "QueueInUse  free_events_ " << free_events_.size()
           << " used_events_ " << used_events_.size();
   // Events are created on demand, and repeatedly reused.  There is no
   // limit placed here on the number of allocated Events.
   if (free_events_.empty()) {
-    free_events_.push_back(new gpu::Event(exec_));
+    free_events_.push_back(new se::Event(exec_));
     free_events_.back()->Init();
   }
-  gpu::Event* e = free_events_.back();
+  se::Event* e = free_events_.back();
   free_events_.pop_back();
   stream->ThenRecordEvent(e);
   iu.event = e;
@@ -199,18 +197,18 @@ void EventMgr::PollEvents(bool is_dedicated_poller,
   // the first non-complete record that is still pending.
   for (auto& iu : used_events_) {
     if (iu.event == nullptr) continue;
-    gpu::Event::Status s = iu.event->PollForStatus();
+    se::Event::Status s = iu.event->PollForStatus();
     switch (s) {
-      case gpu::Event::Status::kUnknown:
-      case gpu::Event::Status::kError:
+      case se::Event::Status::kUnknown:
+      case se::Event::Status::kError:
         // We don't expect to see these.  Someday maybe propagate
         // a Status error, but for now fail hard.
         LOG(FATAL) << "Unexpected Event status: " << static_cast(s);
         break;
-      case gpu::Event::Status::kPending:
+      case se::Event::Status::kPending:
         if (!is_dedicated_poller) return;  // quit processing queue
         break;
-      case gpu::Event::Status::kComplete:
+      case se::Event::Status::kComplete:
         // Make a copy of the InUse record so we can free it after releasing
         // the lock
         to_free->push_back(iu);
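
The loop above is the only consumer of Event::PollForStatus. A compressed
sketch of the record-then-poll lifecycle (stream and executor setup elided;
the busy-wait is for illustration only, since EventMgr sleeps between polls):

    se::Event event(exec_);
    if (event.Init()) {
      stream->ThenRecordEvent(&event);  // marker enqueued behind prior work
      while (event.PollForStatus() == se::Event::Status::kPending) {
        // work enqueued before the event is still in flight
      }
    }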
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
index fd5f50ca4ea..b26f88a201c 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
@@ -44,14 +44,13 @@ class GPUOptions;
 // Events are recorded.
 class EventMgr {
  public:
-  EventMgr(perftools::gputools::StreamExecutor* se,
-           const GPUOptions& gpu_options);
+  EventMgr(se::StreamExecutor* se, const GPUOptions& gpu_options);
 
   ~EventMgr();
 
   // Releases the references on the elements of "tensors" as soon as
   // all events currently enqueued on "stream" have completed.
-  void ThenDeleteTensors(perftools::gputools::Stream* stream,
+  void ThenDeleteTensors(se::Stream* stream,
                          const TensorReferenceVector& tensors);
 
   struct BufRec {
@@ -65,8 +64,7 @@ class EventMgr {
 
   // Takes ownership of *bufrec.buf and calls bufrec.alloc->DeallocateRaw()
   // on it as soon as all events currently enqueued on *stream have completed.
-  inline void ThenDeleteBuffer(perftools::gputools::Stream* stream,
-                               BufRec bufrec) {
+  inline void ThenDeleteBuffer(se::Stream* stream, BufRec bufrec) {
     ToFreeVector to_free;
     {
       mutex_lock l(mu_);
@@ -76,8 +74,7 @@ class EventMgr {
     FreeMemory(to_free);
   }
 
-  inline void ThenExecute(perftools::gputools::Stream* stream,
-                          std::function func) {
+  inline void ThenExecute(se::Stream* stream, std::function func) {
     ToFreeVector to_free;
     {
       mutex_lock l(mu_);
@@ -89,7 +86,7 @@ class EventMgr {
 
  private:
   friend class TEST_EventMgrHelper;
-  perftools::gputools::StreamExecutor* const exec_;
+  se::StreamExecutor* const exec_;
   const int64 deferred_bytes_threshold_;
   const int32 polling_active_delay_usecs_;
   mutex mu_;
@@ -98,7 +95,7 @@ class EventMgr {
   void FlushAccumulatedTensors() EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   struct InUse {
-    perftools::gputools::Event* event;
+    se::Event* event;
     TensorReferenceVector* mem;
     BufRec bufrec;
     std::function<void()> func;
@@ -130,22 +127,21 @@ class EventMgr {
   // Stream-enqueue an unused Event and save with it a collection of
   // Tensors and/or a BufRec to be deleted only after the Event
   // records.
-  void QueueInUse(perftools::gputools::Stream* stream, InUse in_use)
+  void QueueInUse(se::Stream* stream, InUse in_use)
       EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
-  void QueueTensors(perftools::gputools::Stream* stream,
-                    TensorReferenceVector* tensors)
+  void QueueTensors(se::Stream* stream, TensorReferenceVector* tensors)
       EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     QueueInUse(stream, {nullptr, tensors, BufRec(), nullptr});
   }
 
-  void QueueBuffer(perftools::gputools::Stream* stream, BufRec bufrec)
+  void QueueBuffer(se::Stream* stream, BufRec bufrec)
       EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     QueueInUse(stream, {nullptr, nullptr, bufrec, nullptr});
   }
 
-  void QueueFunc(perftools::gputools::Stream* stream,
-                 std::function func) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+  void QueueFunc(se::Stream* stream, std::function func)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     QueueInUse(stream, {nullptr, nullptr, BufRec(), std::move(func)});
   }
 
@@ -166,10 +162,10 @@ class EventMgr {
   void StopPollingLoop();
 
   // A stack of unused events
-  std::vector<perftools::gputools::Event*> free_events_ GUARDED_BY(mu_);
+  std::vector<se::Event*> free_events_ GUARDED_BY(mu_);
 
   // Buffered list of tensors waiting to have an event queued for deletion
-  perftools::gputools::Stream* accumulated_stream_ GUARDED_BY(mu_);
+  se::Stream* accumulated_stream_ GUARDED_BY(mu_);
   TensorReferenceVector* accumulated_tensors_ GUARDED_BY(mu_);
   // Sum of the TotalBytes() of the tensors in "accumulated_tensors_"
   int64 accumulated_tensor_bytes_ GUARDED_BY(mu_);
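
A hedged usage sketch for the ThenExecute path declared above (the lambda body
is a placeholder):

    // Runs the callback on the EventMgr polling thread once everything
    // currently enqueued on `stream` has completed.
    event_mgr->ThenExecute(stream, []() { VLOG(1) << "stream drained"; });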
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
index 3ad0b0eb85f..1d4ad957b94 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
@@ -23,8 +23,6 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
-namespace gpu = ::perftools::gputools;
-
 namespace tensorflow {
 
 class TEST_EventMgrHelper {
@@ -47,8 +45,7 @@ class TEST_EventMgrHelper {
     return em_->free_events_.size();
   }
 
-  void QueueTensors(perftools::gputools::Stream* stream,
-                    TensorReferenceVector* tensors) {
+  void QueueTensors(se::Stream* stream, TensorReferenceVector* tensors) {
     mutex_lock l(em_->mu_);
     em_->QueueTensors(stream, tensors);
   }
@@ -121,7 +118,7 @@ TEST(EventMgr, DelayedPolling) {
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, th.queue_size());
   TensorReferenceVector* v = nullptr;
-  std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
+  std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
   CHECK(stream.get());
   stream->Init();
   for (int i = 0; i < 5; ++i) {
@@ -153,7 +150,7 @@ TEST(EventMgr, FlushLargeTensorImmediately) {
   EventMgr em(stream_exec, GPUOptions());
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, live_tensor_bytes);
-  std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
+  std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
   CHECK(stream.get());
   stream->Init();
   for (int i = 0; i < 5; ++i) {
@@ -170,7 +167,7 @@ TEST(EventMgr, ManySmallTensorsFlushedImmediately) {
   EventMgr em(stream_exec, GPUOptions());
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, live_tensor_bytes);
-  std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
+  std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
   CHECK(stream.get());
   stream->Init();
   for (int i = 0; i < 5; ++i) {
@@ -189,8 +186,8 @@ TEST(EventMgr, StreamSwitchingFlushesImmediately) {
   EventMgr em(stream_exec, GPUOptions());
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, live_tensor_bytes);
-  std::unique_ptr<gpu::Stream> stream1(new gpu::Stream(stream_exec));
-  std::unique_ptr<gpu::Stream> stream2(new gpu::Stream(stream_exec));
+  std::unique_ptr<se::Stream> stream1(new se::Stream(stream_exec));
+  std::unique_ptr<se::Stream> stream2(new se::Stream(stream_exec));
   stream1->Init();
   stream2->Init();
   TensorReferenceVector v1;
@@ -211,7 +208,7 @@ TEST(EventMgr, ManySmallTensorsSeparateCallsFlushed) {
   EventMgr em(stream_exec, GPUOptions());
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, live_tensor_bytes);
-  std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
+  std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
   CHECK(stream.get());
   stream->Init();
   for (int i = 0; i < 5; ++i) {
@@ -234,7 +231,7 @@ TEST(EventMgr, NonEmptyShutdown) {
   TEST_EventMgrHelper th(&em);
   EXPECT_EQ(0, th.queue_size());
   EXPECT_EQ(0, th.free_size());
-  std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
+  std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
   CHECK(stream.get());
   stream->Init();
   for (int i = 0; i < 5; ++i) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_init.cc b/tensorflow/core/common_runtime/gpu/gpu_init.cc
index aa23e3cc614..ff96891a2ab 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_init.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_init.cc
@@ -26,12 +26,10 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 
-namespace gpu = ::perftools::gputools;
-
 namespace tensorflow {
 
 Status ValidateGPUMachineManager() {
-  auto result = gpu::MultiPlatformManager::PlatformWithName("CUDA");
+  auto result = se::MultiPlatformManager::PlatformWithName("CUDA");
   if (!result.ok()) {
     return StreamExecutorUtil::ConvertStatus(result.status());
   }
@@ -39,8 +37,8 @@ Status ValidateGPUMachineManager() {
   return Status::OK();
 }
 
-gpu::Platform* GPUMachineManager() {
-  auto result = gpu::MultiPlatformManager::PlatformWithName("CUDA");
+se::Platform* GPUMachineManager() {
+  auto result = se::MultiPlatformManager::PlatformWithName("CUDA");
   if (!result.ok()) {
     LOG(FATAL) << "Could not find Platform with name CUDA";
     return nullptr;
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc
index 5214ceaae57..7ba853fa51b 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc
@@ -55,19 +55,15 @@ limitations under the License.
 const tensorflow::int64 FLAGS_brain_gpu_util_debug_string_maxlen = 128;
 extern bool FLAGS_brain_gpu_record_mem_types;
 
-using perftools::gputools::DeviceMemoryBase;
-using perftools::gputools::Stream;
-
 namespace tensorflow {
 
-// TODO(b/77980417): Remove this and use the regular tensorflow::se alias once
-// that's available.
-namespace gpu = ::stream_executor;
+using se::DeviceMemoryBase;
+using se::Stream;
 
 Status PrepareCopy(Device* device, const DeviceContext* ctx, const Tensor& src,
                    const Tensor* dst,
                    const DeviceBase::GpuDeviceInfo** dev_info,
-                   gpu::Stream** stream) {
+                   se::Stream** stream) {
   if (device == nullptr) {
     return errors::Internal("Unexpected null device.");
   }
@@ -122,7 +118,7 @@ void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev,
                               StatusCallback done) {
   VLOG(1) << "SetProtoFromGPU device_context " << device_context;
   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
-  gpu::Stream* send_stream = nullptr;
+  se::Stream* send_stream = nullptr;
   Status s = PrepareCopy(dev, device_context, tensor, nullptr, &dev_info,
                          &send_stream);
   if (!s.ok()) {
@@ -197,7 +193,7 @@ void GPUUtil::DeviceToDeviceCopy(DeviceContext* send_dev_context,
                                  const Tensor* input, Tensor* output,
                                  StatusCallback done) {
   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
-  gpu::Stream* send_stream = nullptr;
+  se::Stream* send_stream = nullptr;
   Status s = PrepareCopy(src, send_dev_context, *input, output, &dev_info,
                          &send_stream);
   if (!s.ok()) {
@@ -264,7 +260,7 @@ void GPUUtil::CopyGPUTensorToCPU(Device* gpu_device,
                                  StatusCallback done) {
   VLOG(1) << "CopyGPUTensorToCPU";
   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
-  gpu::Stream* send_stream = nullptr;
+  se::Stream* send_stream = nullptr;
   Status s = PrepareCopy(gpu_device, device_context, *gpu_tensor, cpu_tensor,
                          &dev_info, &send_stream);
   if (!s.ok()) {
@@ -309,7 +305,7 @@ void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor,
                                  StatusCallback done) {
   VLOG(1) << "CopyCPUTensorToGPU";
   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
-  gpu::Stream* recv_stream = nullptr;
+  se::Stream* recv_stream = nullptr;
   Status s = PrepareCopy(gpu_device, device_context, *cpu_tensor, gpu_tensor,
                          &dev_info, &recv_stream);
   if (!s.ok()) {
@@ -432,7 +428,7 @@ void GPUUtil::CopyGPUTensorToSameGPU(Device* gpu_device,
                                      StatusCallback done) {
   VLOG(1) << "CopyGPUTensorToSameGPU";
   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
-  gpu::Stream* send_stream = nullptr;
+  se::Stream* send_stream = nullptr;
   Status s = PrepareCopy(gpu_device, device_context, *src_gpu_tensor,
                          dst_gpu_tensor, &dev_info, &send_stream);
   if (!s.ok()) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.h b/tensorflow/core/common_runtime/gpu/gpu_util.h
index 337dc89895c..0c69a17eaa8 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.h
@@ -74,10 +74,9 @@ class GPUUtil {
   // NOTE: will be removed soon, see StreamExecutorUtil::AsDeviceMemory
   // instead.
   template <typename T>
-  static perftools::gputools::DeviceMemory<T> AsDeviceMemory(const Tensor& t) {
+  static se::DeviceMemory<T> AsDeviceMemory(const Tensor& t) {
     T* ptr = reinterpret_cast<T*>(const_cast<void*>(DMAHelper::base(&t)));
-    return perftools::gputools::DeviceMemory<T>(
-        perftools::gputools::DeviceMemoryBase(ptr, t.TotalBytes()));
+    return se::DeviceMemory<T>(se::DeviceMemoryBase(ptr, t.TotalBytes()));
   }
 
   // Computes a checksum over the contents of "tensor", which is allocated
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.h b/tensorflow/core/common_runtime/gpu/pool_allocator.h
index 91ce830df85..310158aba1b 100644
--- a/tensorflow/core/common_runtime/gpu/pool_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator.h
@@ -181,7 +181,7 @@ class BasicCPUAllocator : public SubAllocator {
 class CUDAHostAllocator : public SubAllocator {
  public:
   // Note: stream_exec cannot be null.
-  explicit CUDAHostAllocator(perftools::gputools::StreamExecutor* stream_exec)
+  explicit CUDAHostAllocator(se::StreamExecutor* stream_exec)
       : stream_exec_(stream_exec) {
     CHECK(stream_exec_ != nullptr);
   }
@@ -206,7 +206,7 @@ class CUDAHostAllocator : public SubAllocator {
   }
 
  private:
-  perftools::gputools::StreamExecutor* stream_exec_;  // not owned, non-null
+  se::StreamExecutor* stream_exec_;  // not owned, non-null
 
   TF_DISALLOW_COPY_AND_ASSIGN(CUDAHostAllocator);
 };
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
index 85555955e37..a4c8d5fe86c 100644
--- a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
@@ -20,18 +20,16 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/test.h"
 
-namespace gpu = ::perftools::gputools;
-
 namespace tensorflow {
 namespace {
 
 TEST(PoolAllocatorTest, ZeroSizeBuffers) {
-  gpu::Platform* platform =
-      gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+  se::Platform* platform =
+      se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
   PoolAllocator pool(
       2 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
-          platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0))
+          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
               .ValueOrDie()),
       new NoopRounder, "pool");
 
@@ -44,12 +42,12 @@ TEST(PoolAllocatorTest, ZeroSizeBuffers) {
 }
 
 TEST(PoolAllocatorTest, ZeroSizePool) {
-  gpu::Platform* platform =
-      gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+  se::Platform* platform =
+      se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
   PoolAllocator pool(
       0 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
-          platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0))
+          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
               .ValueOrDie()),
       new NoopRounder, "pool");
 
@@ -77,12 +75,12 @@ TEST(PoolAllocatorTest, ZeroSizePool) {
 }
 
 TEST(PoolAllocatorTest, Alignment) {
-  gpu::Platform* platform =
-      gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+  se::Platform* platform =
+      se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
   PoolAllocator pool(
       0 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
-          platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0))
+          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
               .ValueOrDie()),
       new NoopRounder, "pool");
   for (int i = 0; i < 16; ++i) {
@@ -123,12 +121,12 @@ TEST(PoolAllocatorTest, AutoResize) {
 }
 
 TEST(PoolAllocatorTest, CudaHostAllocator) {
-  gpu::Platform* platform =
-      gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+  se::Platform* platform =
+      se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
   PoolAllocator pool(
       2 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
-          platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0))
+          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
               .ValueOrDie()),
       new NoopRounder, "pool");
 
@@ -200,12 +198,12 @@ TEST(PoolAllocatorTest, Pow2Rounder) {
 }
 
 TEST(PoolAllocatorTest, Name) {
-  gpu::Platform* platform =
-      gpu::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
+  se::Platform* platform =
+      se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
   PoolAllocator pool(
       2 /*pool_size_limit*/, false /*auto_resize*/,
       new CUDAHostAllocator(
-          platform->GetExecutor(gpu::StreamExecutorConfig(/*ordinal=*/0))
+          platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
               .ValueOrDie()),
       new NoopRounder, "pool");
   EXPECT_EQ("pool", pool.Name());
diff --git a/tensorflow/core/common_runtime/gpu_device_context.h b/tensorflow/core/common_runtime/gpu_device_context.h
index 38a18cd0877..a1ad2c2277d 100644
--- a/tensorflow/core/common_runtime/gpu_device_context.h
+++ b/tensorflow/core/common_runtime/gpu_device_context.h
@@ -63,8 +63,8 @@ class GPUDeviceContext : public DeviceContext {
                              Device* device, Tensor* cpu_tensor,
                              StatusCallback done) override;
 
-  void MaintainLifetimeOnStream(
-      const Tensor* t, perftools::gputools::Stream* stream) const override {}
+  void MaintainLifetimeOnStream(const Tensor* t,
+                                se::Stream* stream) const override {}
 
  private:
   int stream_id_;
diff --git a/tensorflow/core/grappler/devices.cc b/tensorflow/core/grappler/devices.cc
index 2be894a08b2..3268697671b 100644
--- a/tensorflow/core/grappler/devices.cc
+++ b/tensorflow/core/grappler/devices.cc
@@ -31,15 +31,14 @@ int GetNumAvailableGPUs() {
   int num_eligible_gpus = 0;
 #if GOOGLE_CUDA
   if (ValidateGPUMachineManager().ok()) {
-    perftools::gputools::Platform* gpu_manager = GPUMachineManager();
+    se::Platform* gpu_manager = GPUMachineManager();
     if (gpu_manager != nullptr) {
       int num_gpus = gpu_manager->VisibleDeviceCount();
       for (int i = 0; i < num_gpus; i++) {
         auto exec_status = gpu_manager->ExecutorForDevice(i);
         if (exec_status.ok()) {
-          perftools::gputools::StreamExecutor* se = exec_status.ValueOrDie();
-          const perftools::gputools::DeviceDescription& desc =
-              se->GetDeviceDescription();
+          se::StreamExecutor* se = exec_status.ValueOrDie();
+          const se::DeviceDescription& desc = se->GetDeviceDescription();
           int min_gpu_core_count = 8;
           if (desc.core_count() >= min_gpu_core_count) {
             num_eligible_gpus++;
@@ -57,10 +56,9 @@ int GetNumAvailableGPUs() {
 int64 AvailableGPUMemory(int gpu_id) {
 #if GOOGLE_CUDA
   // Look up the device, to see its attributes.
-  perftools::gputools::Platform* gpu_platform = GPUMachineManager();
+  se::Platform* gpu_platform = GPUMachineManager();
   CHECK_LT(gpu_id, gpu_platform->VisibleDeviceCount());
-  perftools::gputools::StreamExecutor* se =
-      gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
+  se::StreamExecutor* se = gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
   int64 total_memory, available_memory;
   CHECK(se->DeviceMemoryUsage(&available_memory, &total_memory));
 
diff --git a/tensorflow/core/kernels/avgpooling_op.cc b/tensorflow/core/kernels/avgpooling_op.cc
index c581d1451f0..ba38e1a188f 100644
--- a/tensorflow/core/kernels/avgpooling_op.cc
+++ b/tensorflow/core/kernels/avgpooling_op.cc
@@ -156,10 +156,10 @@ class AvgPoolingOp : public UnaryOp<T> {
     TensorShape output_shape = params.forward_output_shape();
 
     if (data_format_ == FORMAT_NCHW) {
-      DnnPoolingOp<T>::Compute(
-          context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
-          stride_, padding_, data_format_, tensor_in, output_shape,
-          /*propagate_nans=*/false);
+      DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kAverage, ksize_,
+                               stride_, padding_, data_format_, tensor_in,
+                               output_shape,
+                               /*propagate_nans=*/false);
     } else {
       Tensor* output = nullptr;
       OP_REQUIRES_OK(context,
@@ -417,10 +417,10 @@ class AvgPoolingGradOp : public OpKernel {
       output_shape.AddDim(shape_vec(i));
     }
 
-    DnnPoolingGradOp<T>::Compute(
-        context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
-        stride_, padding_, data_format_, nullptr, nullptr, out_backprop,
-        output_shape, /*propagate_nans=*/false);
+    DnnPoolingGradOp<T>::Compute(context, se::dnn::PoolingMode::kAverage,
+                                 ksize_, stride_, padding_, data_format_,
+                                 nullptr, nullptr, out_backprop, output_shape,
+                                 /*propagate_nans=*/false);
   }
 
  private:
@@ -547,10 +547,10 @@ class AvgPoolingGradOpCustomGPUKernel : public OpKernel {
                                 output->flat<T>().data(),       // bottom_diff
                                 context->eigen_gpu_device());   // d
     } else {
-      DnnPoolingGradOp<T>::Compute(
-          context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
-          stride_, padding_, data_format_, nullptr, nullptr, out_backprop,
-          output_shape, /*propagate_nans=*/false);
+      DnnPoolingGradOp<T>::Compute(context, se::dnn::PoolingMode::kAverage,
+                                   ksize_, stride_, padding_, data_format_,
+                                   nullptr, nullptr, out_backprop, output_shape,
+                                   /*propagate_nans=*/false);
     }
   }
 
diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h
index 43e716c542a..a1c03f99181 100644
--- a/tensorflow/core/kernels/batch_matmul_op_impl.h
+++ b/tensorflow/core/kernels/batch_matmul_op_impl.h
@@ -245,35 +245,35 @@ struct LaunchBatchMatMul {
 
 namespace {
 template <typename T>
-perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
-  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
-  perftools::gputools::DeviceMemory<T> typed(wrapped);
+se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
+  se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
+  se::DeviceMemory<T> typed(wrapped);
   return typed;
 }
 
-class CublasScratchAllocator : public perftools::gputools::ScratchAllocator {
+class CublasScratchAllocator : public se::ScratchAllocator {
  public:
-  using Stream = ::perftools::gputools::Stream;
-  using DeviceMemoryBytes = ::perftools::gputools::DeviceMemory<uint8>;
+  using Stream = se::Stream;
+  using DeviceMemoryBytes = se::DeviceMemory<uint8>;
 
   CublasScratchAllocator(OpKernelContext* context) : context_(context) {}
 
   int64 GetMemoryLimitInBytes(Stream* stream) override { return -1; }
 
-  perftools::gputools::port::StatusOr<DeviceMemoryBytes> AllocateBytes(
+  se::port::StatusOr<DeviceMemoryBytes> AllocateBytes(
       Stream* stream, int64 byte_size) override {
     Tensor temporary_memory;
 
     Status allocation_status(context_->allocate_temp(
         DT_UINT8, TensorShape({byte_size}), &temporary_memory));
     if (!allocation_status.ok()) {
-      return perftools::gputools::port::StatusOr<DeviceMemoryBytes>(
+      return se::port::StatusOr<DeviceMemoryBytes>(
           DeviceMemoryBytes::MakeFromByteSize(nullptr, 0));
     }
     // Hold the reference of the allocated tensors until the end of the
     // allocator.
     allocated_tensors_.push_back(temporary_memory);
-    return perftools::gputools::port::StatusOr<DeviceMemoryBytes>(
+    return se::port::StatusOr<DeviceMemoryBytes>(
         DeviceMemoryBytes::MakeFromByteSize(
             temporary_memory.flat<uint8>().data(),
             temporary_memory.flat<uint8>().size()));
@@ -289,12 +289,11 @@ template <typename Scalar>
 struct LaunchBatchMatMul<GPUDevice, Scalar> {
   static void Launch(OpKernelContext* context, const Tensor& in_x,
                      const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
-    constexpr perftools::gputools::blas::Transpose kTranspose =
-        is_complex<Scalar>::value
-            ? perftools::gputools::blas::Transpose::kConjugateTranspose
-            : perftools::gputools::blas::Transpose::kTranspose;
-    perftools::gputools::blas::Transpose trans[] = {
-        perftools::gputools::blas::Transpose::kNoTranspose, kTranspose};
+    constexpr se::blas::Transpose kTranspose =
+        is_complex<Scalar>::value ? se::blas::Transpose::kConjugateTranspose
+                                  : se::blas::Transpose::kTranspose;
+    se::blas::Transpose trans[] = {se::blas::Transpose::kNoTranspose,
+                                   kTranspose};
     const uint64 m = in_x.dim_size(adj_x ? 2 : 1);
     const uint64 k = in_x.dim_size(adj_x ? 1 : 2);
     const uint64 n = in_y.dim_size(adj_y ? 1 : 2);
@@ -305,7 +304,7 @@ struct LaunchBatchMatMul {
     auto* stream = context->op_device_context()->stream();
     OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
 
-    typedef perftools::gputools::DeviceMemory<Scalar> DeviceMemoryType;
+    typedef se::DeviceMemory<Scalar> DeviceMemoryType;
     std::vector<DeviceMemoryType> a_device_memory;
     std::vector<DeviceMemoryType> b_device_memory;
     std::vector<DeviceMemoryType> c_device_memory;
@@ -340,19 +339,16 @@ struct LaunchBatchMatMul {
       // This is a regular matrix*matrix or matrix*vector multiply. Avoid the
       // overhead of the scratch allocator and the batch interface.
       if (n == 1 &&
-          blas_transpose_b !=
-              perftools::gputools::blas::Transpose::kConjugateTranspose &&
-          blas_transpose_a !=
-              perftools::gputools::blas::Transpose::kConjugateTranspose) {
+          blas_transpose_b != se::blas::Transpose::kConjugateTranspose &&
+          blas_transpose_a != se::blas::Transpose::kConjugateTranspose) {
         // This is a matrix*vector multiply so use GEMV to compute A * b.
         // Here we are multiplying in the natural order, so we have to flip
         // the transposition flag to compensate for the tensor being stored
         // row-major. Since GEMV doesn't provide a way to just conjugate an
         // argument, we have to defer those cases to GEMM below.
-        auto gemv_trans_a =
-            blas_transpose_a == perftools::gputools::blas::Transpose::kTranspose
-                ? perftools::gputools::blas::Transpose::kNoTranspose
-                : perftools::gputools::blas::Transpose::kTranspose;
+        auto gemv_trans_a = blas_transpose_a == se::blas::Transpose::kTranspose
+                                ? se::blas::Transpose::kNoTranspose
+                                : se::blas::Transpose::kTranspose;
         bool blas_launch_status =
             stream
                 ->ThenBlasGemv(gemv_trans_a, adj_x ? m : k, adj_x ? k : m,
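For reference, the transposition flip in the GEMV branch above follows from the usual row-major/column-major identity: a row-major matrix handed to a column-major BLAS routine is read as its own transpose. A self-contained sketch, with naive loops standing in for the real ThenBlasGemv call:

#include <cstdio>

// y = op(A) * x for a column-major m x n matrix A (leading dimension m).
void NaiveGemv(bool transpose, int m, int n, const float* a, const float* x,
               float* y) {
  const int rows = transpose ? n : m;
  const int cols = transpose ? m : n;
  for (int i = 0; i < rows; ++i) {
    y[i] = 0.0f;
    for (int j = 0; j < cols; ++j) {
      // Column-major element (r, c) lives at a[c * m + r].
      const float aij = transpose ? a[i * m + j] : a[j * m + i];
      y[i] += aij * x[j];
    }
  }
}

int main() {
  // A is 2 x 3 in row-major order: [[1 2 3], [4 5 6]].
  const float a[] = {1, 2, 3, 4, 5, 6};
  const float x[] = {1, 1, 1};
  float y[2];
  // Read column-major with m = 3, the same buffer is A^T, so requesting
  // "transpose" recovers the row-major product A * x.
  NaiveGemv(/*transpose=*/true, /*m=*/3, /*n=*/2, a, x, y);
  std::printf("%g %g\n", y[0], y[1]);  // prints: 6 15
}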
diff --git a/tensorflow/core/kernels/bias_op.cc b/tensorflow/core/kernels/bias_op.cc
index 368993c8271..9fda7169a8b 100644
--- a/tensorflow/core/kernels/bias_op.cc
+++ b/tensorflow/core/kernels/bias_op.cc
@@ -393,8 +393,8 @@ class BiasGradOp : public OpKernel {
     if (channel == 0) return;
     auto* stream = context->op_device_context()->stream();
     OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
-    perftools::gputools::DeviceMemoryBase output_ptr(
-        output->flat<T>().data(), output->NumElements() * sizeof(T));
+    se::DeviceMemoryBase output_ptr(output->flat<T>().data(),
+                                    output->NumElements() * sizeof(T));
     stream->ThenMemZero(&output_ptr, output->NumElements() * sizeof(T));
     if (output_backprop.NumElements() > 0) {
       BiasGradGPU<T>::compute(context->template eigen_device<Device>(),
diff --git a/tensorflow/core/kernels/check_numerics_op.cc b/tensorflow/core/kernels/check_numerics_op.cc
index d3b67f4614e..c3c0c500076 100644
--- a/tensorflow/core/kernels/check_numerics_op.cc
+++ b/tensorflow/core/kernels/check_numerics_op.cc
@@ -139,7 +139,7 @@ class CheckNumericsOp : public AsyncOpKernel {
     OP_REQUIRES_ASYNC(context, stream != nullptr,
                       errors::Internal("No GPU stream available."), done);
 
-    perftools::gputools::DeviceMemoryBase abnormal_detected_ptr(
+    se::DeviceMemoryBase abnormal_detected_ptr(
         abnormal_detected.flat<int>().data(),
         abnormal_detected.flat<int>().size());
     stream->ThenMemset32(&abnormal_detected_ptr, 0,
@@ -174,8 +174,8 @@ class CheckNumericsOp : public AsyncOpKernel {
     TensorReference abnormal_detected_ref(abnormal_detected);
     auto check_cb = [this, stream, abnormal_detected_ref,
                      abnormal_detected_host, context, done]() {
-      ::perftools::gputools::cuda::ScopedActivateExecutorContext
-          scoped_activation{stream->parent()};
+      se::cuda::ScopedActivateExecutorContext scoped_activation{
+          stream->parent()};
       auto abnormal_detected_host_flat = abnormal_detected_host.flat<int>();
       int is_nan = abnormal_detected_host_flat(0);
       int is_inf = abnormal_detected_host_flat(1);
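The flags read in this callback are produced by the device kernel; on the host the same contract could be expressed as a simple scan. A sketch, assuming plain float input (the real check runs on the GPU tensor):

#include <cmath>

// flags[0] becomes 1 if any value is NaN, flags[1] if any is +/-Inf,
// mirroring the two-slot abnormal_detected layout used above.
void ScanForAbnormal(const float* data, int n, int flags[2]) {
  flags[0] = flags[1] = 0;
  for (int i = 0; i < n; ++i) {
    if (std::isnan(data[i])) flags[0] = 1;
    if (std::isinf(data[i])) flags[1] = 1;
  }
}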
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index f3b91494b97..ef1e73e5ab1 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -532,7 +532,7 @@ struct ConvBackwardFilterAutoTuneGroup {
   static string name() { return "ConvBwdFilter"; }
 };
 typedef AutoTuneSingleton<ConvBackwardFilterAutoTuneGroup, ConvParameters,
-                          perftools::gputools::dnn::AlgorithmConfig>
+                          se::dnn::AlgorithmConfig>
     AutoTuneConvBwdFilter;
 
 // Backprop for filter.
@@ -636,9 +636,9 @@ void LaunchConv2DBackpropFilterOp<GPUDevice, T>::operator()(
     const Tensor& out_backprop, const Tensor& input, int row_dilation,
     int col_dilation, int row_stride, int col_stride, const Padding& padding,
     Tensor* filter_backprop, TensorFormat data_format) {
-  using perftools::gputools::dnn::AlgorithmConfig;
-  using perftools::gputools::dnn::AlgorithmDesc;
-  using perftools::gputools::dnn::ProfileResult;
+  using se::dnn::AlgorithmConfig;
+  using se::dnn::AlgorithmDesc;
+  using se::dnn::ProfileResult;
 
   std::vector<int32> dilations(4, 1);
   dilations[GetTensorDimIndex(data_format, 'H')] = row_dilation;
@@ -721,9 +721,9 @@ void LaunchConv2DBackpropFilterOp<GPUDevice, T>::operator()(
 
     bool blas_launch_status =
         stream
-            ->ThenBlasGemm(perftools::gputools::blas::Transpose::kNoTranspose,
-                           perftools::gputools::blas::Transpose::kTranspose, n,
-                           m, k, 1.0f, a_ptr, n, b_ptr, m, 0.0f, &c_ptr, n)
+            ->ThenBlasGemm(se::blas::Transpose::kNoTranspose,
+                           se::blas::Transpose::kTranspose, n, m, k, 1.0f,
+                           a_ptr, n, b_ptr, m, 0.0f, &c_ptr, n)
             .ok();
     if (!blas_launch_status) {
       ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
@@ -751,9 +751,9 @@ void LaunchConv2DBackpropFilterOp<GPUDevice, T>::operator()(
 
     bool blas_launch_status =
         stream
-            ->ThenBlasGemm(perftools::gputools::blas::Transpose::kNoTranspose,
-                           perftools::gputools::blas::Transpose::kTranspose, n,
-                           m, k, 1.0f, b_ptr, n, a_ptr, m, 0.0f, &c_ptr, n)
+            ->ThenBlasGemm(se::blas::Transpose::kNoTranspose,
+                           se::blas::Transpose::kTranspose, n, m, k, 1.0f,
+                           b_ptr, n, a_ptr, m, 0.0f, &c_ptr, n)
             .ok();
     if (!blas_launch_status) {
       ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
@@ -787,24 +787,24 @@ void LaunchConv2DBackpropFilterOp<GPUDevice, T>::operator()(
   CHECK(padding_rows >= 0 && padding_cols >= 0)
       << "Negative row or col paddings: (" << padding_rows << ", "
       << padding_cols << ")";
-  perftools::gputools::dnn::BatchDescriptor input_desc;
+  se::dnn::BatchDescriptor input_desc;
   input_desc.set_count(dims.batch_size)
       .set_height(GetTensorDim(compatible_input, data_format, 'H'))
       .set_width(GetTensorDim(compatible_input, data_format, 'W'))
       .set_feature_map_count(dims.in_depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::BatchDescriptor output_desc;
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+  se::dnn::BatchDescriptor output_desc;
   output_desc.set_count(dims.batch_size)
       .set_height(dims.spatial_dims[0].output_size)
       .set_width(dims.spatial_dims[1].output_size)
       .set_feature_map_count(dims.out_depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::FilterDescriptor filter_desc;
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+  se::dnn::FilterDescriptor filter_desc;
   filter_desc.set_input_filter_height(dims.spatial_dims[0].filter_size)
       .set_input_filter_width(dims.spatial_dims[1].filter_size)
       .set_input_feature_map_count(dims.in_depth)
       .set_output_feature_map_count(dims.out_depth);
-  perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
+  se::dnn::ConvolutionDescriptor conv_desc;
   conv_desc.set_vertical_dilation_rate(dims.spatial_dims[0].dilation)
       .set_horizontal_dilation_rate(dims.spatial_dims[1].dilation)
       .set_vertical_filter_stride(dims.spatial_dims[0].stride)
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 66d15c6e787..35f2676023a 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -604,7 +604,7 @@ struct ConvBackwardDataAutoTuneGroup {
   static string name() { return "ConvBwdData"; }
 };
 typedef AutoTuneSingleton<ConvBackwardDataAutoTuneGroup, ConvParameters,
-                          perftools::gputools::dnn::AlgorithmConfig>
+                          se::dnn::AlgorithmConfig>
     AutoTuneConvBwdData;
 
 // Backprop for input.
@@ -705,9 +705,9 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     const Tensor& out_backprop, const Tensor& filter, int row_dilation,
     int col_dilation, int row_stride, int col_stride, const Padding& padding,
     Tensor* in_backprop, TensorFormat data_format) {
-  using perftools::gputools::dnn::AlgorithmConfig;
-  using perftools::gputools::dnn::AlgorithmDesc;
-  using perftools::gputools::dnn::ProfileResult;
+  using se::dnn::AlgorithmConfig;
+  using se::dnn::AlgorithmDesc;
+  using se::dnn::ProfileResult;
 
   std::vector<int32> strides(4, 1);
   std::vector<int32> dilations(4, 1);
@@ -778,8 +778,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(),
                                 in_backprop->template flat<T>().size());
 
-    auto transpose = perftools::gputools::blas::Transpose::kTranspose;
-    auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+    auto transpose = se::blas::Transpose::kTranspose;
+    auto no_transpose = se::blas::Transpose::kNoTranspose;
 
     bool blas_launch_status =
         stream
@@ -810,8 +810,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(),
                                 in_backprop->template flat<T>().size());
 
-    auto transpose = perftools::gputools::blas::Transpose::kTranspose;
-    auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+    auto transpose = se::blas::Transpose::kTranspose;
+    auto no_transpose = se::blas::Transpose::kNoTranspose;
 
     bool blas_launch_status =
         stream
@@ -841,24 +841,24 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
   CHECK(padding_rows >= 0 && padding_cols >= 0)
       << "Negative row or col paddings: (" << padding_rows << ", "
       << padding_cols << ")";
-  perftools::gputools::dnn::BatchDescriptor input_desc;
+  se::dnn::BatchDescriptor input_desc;
   input_desc.set_count(dims.batch_size)
       .set_height(GetTensorDim(compatible_input_shape, data_format, 'H'))
       .set_width(GetTensorDim(compatible_input_shape, data_format, 'W'))
       .set_feature_map_count(dims.in_depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::BatchDescriptor output_desc;
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+  se::dnn::BatchDescriptor output_desc;
   output_desc.set_count(dims.batch_size)
       .set_height(dims.spatial_dims[0].output_size)
       .set_width(dims.spatial_dims[1].output_size)
       .set_feature_map_count(dims.out_depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::FilterDescriptor filter_desc;
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+  se::dnn::FilterDescriptor filter_desc;
   filter_desc.set_input_filter_height(dims.spatial_dims[0].filter_size)
       .set_input_filter_width(dims.spatial_dims[1].filter_size)
       .set_input_feature_map_count(dims.in_depth)
       .set_output_feature_map_count(dims.out_depth);
-  perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
+  se::dnn::ConvolutionDescriptor conv_desc;
   conv_desc.set_vertical_dilation_rate(dims.spatial_dims[0].dilation)
       .set_horizontal_dilation_rate(dims.spatial_dims[1].dilation)
       .set_vertical_filter_stride(dims.spatial_dims[0].stride)
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index 092e859a5be..9edc6d416e3 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -35,7 +35,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #include "tensorflow/core/platform/stream_executor.h"
-using perftools::gputools::dnn::DimIndex;
+using stream_executor::dnn::DimIndex;
 #endif
 
 namespace tensorflow {
@@ -468,7 +468,7 @@ struct Conv3dBackwardDataAutoTuneGroup {
   static string name() { return "Conv3dBwdData"; }
 };
 typedef AutoTuneSingleton<Conv3dBackwardDataAutoTuneGroup, ConvParameters,
-                          perftools::gputools::dnn::AlgorithmConfig>
+                          se::dnn::AlgorithmConfig>
 
     AutoTuneConv3dBwdData;
 template <typename T>
@@ -554,8 +554,8 @@ class Conv3DBackpropInputOp : public OpKernel {
       auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(),
                                   in_backprop->template flat<T>().size());
 
-      auto transpose = perftools::gputools::blas::Transpose::kTranspose;
-      auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+      auto transpose = se::blas::Transpose::kTranspose;
+      auto no_transpose = se::blas::Transpose::kNoTranspose;
 
       bool blas_launch_status =
           stream
@@ -582,8 +582,8 @@ class Conv3DBackpropInputOp : public OpKernel {
       auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(),
                                   in_backprop->template flat<T>().size());
 
-      auto transpose = perftools::gputools::blas::Transpose::kTranspose;
-      auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+      auto transpose = se::blas::Transpose::kTranspose;
+      auto no_transpose = se::blas::Transpose::kNoTranspose;
 
       bool blas_launch_status =
           stream
@@ -629,27 +629,27 @@ class Conv3DBackpropInputOp : public OpKernel {
     CHECK(padding_rows >= 0 && padding_cols >= 0 && padding_planes >= 0)
         << "Negative paddings: (" << padding_rows << ", " << padding_cols
         << ", " << padding_planes << ")";
-    perftools::gputools::dnn::BatchDescriptor input_desc(3);
+    se::dnn::BatchDescriptor input_desc(3);
     input_desc.set_count(batch)
         .set_spatial_dim(DimIndex::X, compatible_input_shape.dim_size(4))
         .set_spatial_dim(DimIndex::Y, compatible_input_shape.dim_size(3))
         .set_spatial_dim(DimIndex::Z, compatible_input_shape.dim_size(2))
         .set_feature_map_count(in_depth)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-    perftools::gputools::dnn::BatchDescriptor output_desc(3);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+    se::dnn::BatchDescriptor output_desc(3);
     output_desc.set_count(batch)
         .set_spatial_dim(DimIndex::X, output_cols)
         .set_spatial_dim(DimIndex::Y, output_rows)
         .set_spatial_dim(DimIndex::Z, output_planes)
         .set_feature_map_count(out_depth)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-    perftools::gputools::dnn::FilterDescriptor filter_desc(3);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+    se::dnn::FilterDescriptor filter_desc(3);
     filter_desc.set_spatial_dim(DimIndex::X, filter_size[2])
         .set_spatial_dim(DimIndex::Y, filter_size[1])
         .set_spatial_dim(DimIndex::Z, filter_size[0])
         .set_input_feature_map_count(in_depth)
         .set_output_feature_map_count(out_depth);
-    perftools::gputools::dnn::ConvolutionDescriptor conv_desc(3);
+    se::dnn::ConvolutionDescriptor conv_desc(3);
     conv_desc.set_dilation_rate(DimIndex::X, dilations[2])
         .set_dilation_rate(DimIndex::Y, dilations[1])
         .set_dilation_rate(DimIndex::Z, dilations[0])
@@ -725,9 +725,9 @@ class Conv3DBackpropInputOp : public OpKernel {
         device_id,
     };
 
-    using perftools::gputools::dnn::AlgorithmConfig;
-    using perftools::gputools::dnn::AlgorithmDesc;
-    using perftools::gputools::dnn::ProfileResult;
+    using se::dnn::AlgorithmConfig;
+    using se::dnn::AlgorithmDesc;
+    using se::dnn::ProfileResult;
     AlgorithmConfig algorithm_config;
     if (cudnn_use_autotune_ && !AutoTuneConv3dBwdData::GetInstance()->Find(
                                    conv_parameters, &algorithm_config)) {
@@ -839,7 +839,7 @@ struct Conv3dBackwardFilterAutoTuneGroup {
   static string name() { return "Conv3dBwdFilter"; }
 };
 typedef AutoTuneSingleton<Conv3dBackwardFilterAutoTuneGroup, ConvParameters,
-                          perftools::gputools::dnn::AlgorithmConfig>
+                          se::dnn::AlgorithmConfig>
     AutoTuneConv3dBwdFilter;
 
 template <typename T>
@@ -941,9 +941,9 @@ class Conv3DBackpropFilterOp : public OpKernel {
 
       bool blas_launch_status =
           stream
-              ->ThenBlasGemm(perftools::gputools::blas::Transpose::kNoTranspose,
-                             perftools::gputools::blas::Transpose::kTranspose,
-                             n, m, k, 1.0f, a_ptr, n, b_ptr, m, 0.0f, &c_ptr, n)
+              ->ThenBlasGemm(se::blas::Transpose::kNoTranspose,
+                             se::blas::Transpose::kTranspose, n, m, k, 1.0f,
+                             a_ptr, n, b_ptr, m, 0.0f, &c_ptr, n)
               .ok();
       if (!blas_launch_status) {
         context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
@@ -967,9 +967,9 @@ class Conv3DBackpropFilterOp : public OpKernel {
 
       bool blas_launch_status =
           stream
-              ->ThenBlasGemm(perftools::gputools::blas::Transpose::kNoTranspose,
-                             perftools::gputools::blas::Transpose::kTranspose,
-                             n, m, k, 1.0f, b_ptr, n, a_ptr, m, 0.0f, &c_ptr, n)
+              ->ThenBlasGemm(se::blas::Transpose::kNoTranspose,
+                             se::blas::Transpose::kTranspose, n, m, k, 1.0f,
+                             b_ptr, n, a_ptr, m, 0.0f, &c_ptr, n)
               .ok();
       if (!blas_launch_status) {
         context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
@@ -1014,7 +1014,7 @@ class Conv3DBackpropFilterOp : public OpKernel {
     CHECK(padding_rows >= 0 && padding_cols >= 0 && padding_planes >= 0)
         << "Negative paddings: (" << padding_rows << ", " << padding_cols
         << ", " << padding_planes << ")";
-    perftools::gputools::dnn::BatchDescriptor input_desc(3);
+    se::dnn::BatchDescriptor input_desc(3);
     input_desc.set_count(batch)
         .set_spatial_dim(DimIndex::X,
                          GetTensorDim(compatible_input, data_format_, '2'))
@@ -1023,21 +1023,21 @@ class Conv3DBackpropFilterOp : public OpKernel {
         .set_spatial_dim(DimIndex::Z,
                          GetTensorDim(compatible_input, data_format_, '0'))
         .set_feature_map_count(in_depth)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-    perftools::gputools::dnn::BatchDescriptor output_desc(3);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+    se::dnn::BatchDescriptor output_desc(3);
     output_desc.set_count(batch)
         .set_spatial_dim(DimIndex::X, output_cols)
         .set_spatial_dim(DimIndex::Y, output_rows)
         .set_spatial_dim(DimIndex::Z, output_planes)
         .set_feature_map_count(out_depth)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-    perftools::gputools::dnn::FilterDescriptor filter_desc(3);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+    se::dnn::FilterDescriptor filter_desc(3);
     filter_desc.set_spatial_dim(DimIndex::X, filter_size[2])
         .set_spatial_dim(DimIndex::Y, filter_size[1])
         .set_spatial_dim(DimIndex::Z, filter_size[0])
         .set_input_feature_map_count(in_depth)
         .set_output_feature_map_count(out_depth);
-    perftools::gputools::dnn::ConvolutionDescriptor conv_desc(3);
+    se::dnn::ConvolutionDescriptor conv_desc(3);
     conv_desc.set_dilation_rate(DimIndex::X, dilations[2])
         .set_dilation_rate(DimIndex::Y, dilations[1])
         .set_dilation_rate(DimIndex::Z, dilations[0])
@@ -1121,9 +1121,9 @@ class Conv3DBackpropFilterOp : public OpKernel {
         device_id,
     };
 
-    using perftools::gputools::dnn::AlgorithmConfig;
-    using perftools::gputools::dnn::AlgorithmDesc;
-    using perftools::gputools::dnn::ProfileResult;
+    using se::dnn::AlgorithmConfig;
+    using se::dnn::AlgorithmDesc;
+    using se::dnn::ProfileResult;
     AlgorithmConfig algorithm_config;
     if (cudnn_use_autotune_ && !AutoTuneConv3dBwdFilter::GetInstance()->Find(
                                    conv_parameters, &algorithm_config)) {
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index f0888c655fe..c6d36b40fe7 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -475,7 +475,7 @@ struct ConvAutoTuneGroup {
   static string name() { return "Conv"; }
 };
 typedef AutoTuneSingleton<ConvAutoTuneGroup, ConvParameters,
-                          perftools::gputools::dnn::AlgorithmConfig>
+                          se::dnn::AlgorithmConfig>
     AutoTuneConv;
 
 template <typename T>
@@ -484,9 +484,9 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     const Tensor& input_param, const Tensor& filter, int row_dilation,
     int col_dilation, int row_stride, int col_stride, const Padding& padding,
     Tensor* output, TensorFormat data_format) {
-  using perftools::gputools::dnn::AlgorithmConfig;
-  using perftools::gputools::dnn::AlgorithmDesc;
-  using perftools::gputools::dnn::ProfileResult;
+  using se::dnn::AlgorithmConfig;
+  using se::dnn::AlgorithmDesc;
+  using se::dnn::ProfileResult;
   auto* stream = ctx->op_device_context()->stream();
   OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
 
@@ -514,7 +514,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                 output->template flat<T>().size());
 
-    auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+    auto no_transpose = se::blas::Transpose::kNoTranspose;
     bool blas_launch_status =
         stream
             ->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr, n,
@@ -543,7 +543,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                 output->template flat<T>().size());
 
-    auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+    auto no_transpose = se::blas::Transpose::kNoTranspose;
     bool blas_launch_status =
         stream
             ->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr, n,
@@ -629,24 +629,24 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
   CHECK(padding_rows >= 0 && padding_cols >= 0)
       << "Negative row or col paddings: (" << padding_rows << ", "
       << padding_cols << ")";
-  perftools::gputools::dnn::BatchDescriptor input_desc;
+  se::dnn::BatchDescriptor input_desc;
   input_desc.set_count(in_batch)
       .set_feature_map_count(in_depths)
       .set_height(in_rows)
       .set_width(in_cols)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::BatchDescriptor output_desc;
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+  se::dnn::BatchDescriptor output_desc;
   output_desc.set_count(out_batch)
       .set_height(out_rows)
       .set_width(out_cols)
       .set_feature_map_count(out_depths)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::FilterDescriptor filter_desc;
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+  se::dnn::FilterDescriptor filter_desc;
   filter_desc.set_input_filter_height(filter.dim_size(0))
       .set_input_filter_width(filter.dim_size(1))
       .set_input_feature_map_count(filter.dim_size(2))
       .set_output_feature_map_count(filter.dim_size(3));
-  perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
+  se::dnn::ConvolutionDescriptor conv_desc;
   conv_desc.set_vertical_dilation_rate(row_dilation)
       .set_horizontal_dilation_rate(col_dilation)
       .set_vertical_filter_stride(row_stride)
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 48dd3c9eb03..9ec16be67d8 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -34,7 +34,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #include "tensorflow/core/platform/stream_executor.h"
-using perftools::gputools::dnn::DimIndex;
+using stream_executor::dnn::DimIndex;
 #endif
 
 namespace tensorflow {
@@ -192,7 +192,7 @@ struct Conv3dAutoTuneGroup {
   static string name() { return "Conv3d"; }
 };
 typedef AutoTuneSingleton<Conv3dAutoTuneGroup, ConvParameters,
-                          perftools::gputools::dnn::AlgorithmConfig>
+                          se::dnn::AlgorithmConfig>
     AutoTuneConv3d;
 
 // TODO(mjanusz): Share logic with 2d implementation as much as possible.
@@ -250,7 +250,7 @@ struct LaunchConvOp {
       auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                   output->template flat<T>().size());
 
-      auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+      auto no_transpose = se::blas::Transpose::kNoTranspose;
       bool blas_launch_status =
           stream
               ->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr,
@@ -277,7 +277,7 @@ struct LaunchConvOp {
       auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                   output->template flat<T>().size());
 
-      auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+      auto no_transpose = se::blas::Transpose::kNoTranspose;
       bool blas_launch_status =
           stream
               ->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr,
@@ -346,27 +346,27 @@ struct LaunchConvOp {
     CHECK(pad_rows >= 0 && pad_cols >= 0 && pad_planes >= 0)
         << "Negative paddings: (" << pad_rows << ", " << pad_cols << ", "
         << pad_planes << ")";
-    perftools::gputools::dnn::BatchDescriptor input_desc(3);
+    se::dnn::BatchDescriptor input_desc(3);
     input_desc.set_count(in_batch)
         .set_feature_map_count(in_depth)
         .set_spatial_dim(DimIndex::X, in_cols)
         .set_spatial_dim(DimIndex::Y, in_rows)
         .set_spatial_dim(DimIndex::Z, in_planes)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-    perftools::gputools::dnn::BatchDescriptor output_desc(3);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+    se::dnn::BatchDescriptor output_desc(3);
     output_desc.set_count(in_batch)
         .set_spatial_dim(DimIndex::X, out_cols)
         .set_spatial_dim(DimIndex::Y, out_rows)
         .set_spatial_dim(DimIndex::Z, out_planes)
         .set_feature_map_count(out_depth)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-    perftools::gputools::dnn::FilterDescriptor filter_desc(3);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+    se::dnn::FilterDescriptor filter_desc(3);
     filter_desc.set_spatial_dim(DimIndex::X, filter_cols)
         .set_spatial_dim(DimIndex::Y, filter_rows)
         .set_spatial_dim(DimIndex::Z, filter_planes)
         .set_input_feature_map_count(in_depth)
         .set_output_feature_map_count(out_depth);
-    perftools::gputools::dnn::ConvolutionDescriptor conv_desc(3);
+    se::dnn::ConvolutionDescriptor conv_desc(3);
     conv_desc.set_dilation_rate(DimIndex::X, dilations[2])
         .set_dilation_rate(DimIndex::Y, dilations[1])
         .set_dilation_rate(DimIndex::Z, dilations[0])
@@ -424,9 +424,9 @@ struct LaunchConvOp {
         device_id,
     };
 
-    using perftools::gputools::dnn::AlgorithmConfig;
-    using perftools::gputools::dnn::AlgorithmDesc;
-    using perftools::gputools::dnn::ProfileResult;
+    using se::dnn::AlgorithmConfig;
+    using se::dnn::AlgorithmDesc;
+    using se::dnn::ProfileResult;
 
     AlgorithmConfig algorithm_config;
 
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index 7f9cfec981f..4215c4541c7 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -36,25 +36,23 @@ int64 GetCudnnWorkspaceLimit(const string& envvar_in_mb,
 // A class to provide scratch-space allocator for Stream-Executor Cudnn
 // callback. TensorFlow is responsible for releasing the temporary buffers after
 // the kernel finishes.
-class CudnnScratchAllocator : public perftools::gputools::ScratchAllocator {
+class CudnnScratchAllocator : public se::ScratchAllocator {
  public:
   virtual ~CudnnScratchAllocator() {}
   CudnnScratchAllocator(int64 memory_limit, OpKernelContext* context)
       : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {}
-  int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override {
+  int64 GetMemoryLimitInBytes(se::Stream* stream) override {
     return memory_limit_;
   }
-  perftools::gputools::port::StatusOr<perftools::gputools::DeviceMemory<uint8>>
-  AllocateBytes(perftools::gputools::Stream* stream, int64 byte_size) override {
+  se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
+      se::Stream* stream, int64 byte_size) override {
     Tensor temporary_memory;
     if (byte_size < 0) {
-      return perftools::gputools::port::Status{
-          perftools::gputools::port::error::INVALID_ARGUMENT,
-          "Requested negative byte size!"};
+      return se::port::Status{se::port::error::INVALID_ARGUMENT,
+                              "Requested negative byte size!"};
     }
     if (byte_size > memory_limit_) {
-      return perftools::gputools::port::StatusOr<
-          perftools::gputools::DeviceMemory<uint8>>();
+      return se::port::StatusOr<se::DeviceMemory<uint8>>();
     }
     AllocationAttributes allocation_attr;
     allocation_attr.no_retry_on_failure = true;
@@ -62,15 +60,13 @@ class CudnnScratchAllocator : public perftools::gputools::ScratchAllocator {
         DT_UINT8, TensorShape({byte_size}), &temporary_memory,
         AllocatorAttributes(), allocation_attr));
     if (!allocation_status.ok()) {
-      return perftools::gputools::port::StatusOr<
-          perftools::gputools::DeviceMemory<uint8>>();
+      return se::port::StatusOr<se::DeviceMemory<uint8>>();
     }
     // Hold the reference of the allocated tensors until the end of the
     // allocator.
     allocated_tensors_.push_back(temporary_memory);
     total_byte_size_ += byte_size;
-    return perftools::gputools::port::StatusOr<
-        perftools::gputools::DeviceMemory<uint8>>(
+    return se::port::StatusOr<se::DeviceMemory<uint8>>(
         AsDeviceMemory(temporary_memory.flat<uint8>().data(),
                        temporary_memory.flat<uint8>().size()));
   }
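CudnnScratchAllocator illustrates a recurring StreamExecutor pattern: scratch buffers are backed by TensorFlow tensors whose references are held until the allocator is destroyed, so cuDNN can rely on the memory for the whole kernel invocation. A stripped-down sketch of the lifetime idea using only standard C++ (names hypothetical):

#include <cstdint>
#include <memory>
#include <vector>

class ScratchKeeper {
 public:
  // Returns byte_size bytes that stay valid until *this is destroyed.
  uint8_t* AllocateBytes(int64_t byte_size) {
    auto buf = std::make_shared<std::vector<uint8_t>>(byte_size);
    kept_.push_back(buf);  // keep the backing storage alive
    return buf->data();
  }

 private:
  std::vector<std::shared_ptr<std::vector<uint8_t>>> kept_;
};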
@@ -141,9 +137,9 @@ class ConvParameters {
   // for certain input parameters so as to avoid a bug in cuDNNv5 and cuDNNv6.
   template <typename T>
   bool ShouldIncludeWinogradNonfusedAlgo(
-      perftools::gputools::StreamExecutor* stream_exec) const {
+      se::StreamExecutor* stream_exec) const {
     // Skip this check for cuDNN 7 and newer.
-    perftools::gputools::port::StatusOr<std::tuple<int, int, int>> version =
+    se::port::StatusOr<std::tuple<int, int, int>> version =
         stream_exec->AsDnn()->GetVersion();
     if (version.ok() && std::get<0>(version.ValueOrDie()) >= 7) {
       return true;
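The early return above gates a cuDNN 5/6 workaround on the reported major version. The comparison itself reduces to a one-liner; a sketch, assuming the version arrives as a (major, minor, patch) tuple as in the GetVersion() call above:

#include <tuple>

// True once cuDNN reports major version 7 or newer, i.e. the
// Winograd-nonfused workaround is no longer needed.
bool SkipWinogradWorkaround(const std::tuple<int, int, int>& version) {
  return std::get<0>(version) >= 7;
}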
diff --git a/tensorflow/core/kernels/crop_and_resize_op.cc b/tensorflow/core/kernels/crop_and_resize_op.cc
index 45cc2fbbb8b..54ef9c6fb48 100644
--- a/tensorflow/core/kernels/crop_and_resize_op.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op.cc
@@ -39,17 +39,16 @@ limitations under the License.
 #include "tensorflow/core/platform/cuda.h"
 #include "tensorflow/core/platform/stream_executor.h"
 
-using ::perftools::gputools::cuda::ScopedActivateExecutorContext;
+using stream_executor::cuda::ScopedActivateExecutorContext;
 #endif  // GOOGLE_CUDA
 
 namespace tensorflow {
+namespace {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 using Callback = std::function<void()>;
 
-namespace {
-
 static inline Status ParseAndCheckBoxSizes(const Tensor& boxes,
                                            const Tensor& box_index,
                                            int* num_boxes) {
@@ -753,8 +752,7 @@ inline void RunIfBoxIndexIsValid(
       context->allocate_temp(DataTypeToEnum<bool>::value, TensorShape({}),
                              &isvalid_host_tensor, alloc_attr),
       done);
-  perftools::gputools::DeviceMemoryBase wrapped(isvalid_dev.data(),
-                                                sizeof(bool));
+  se::DeviceMemoryBase wrapped(isvalid_dev.data(), sizeof(bool));
   const bool status =
       stream
           ->ThenMemcpy(
diff --git a/tensorflow/core/kernels/cuda_device_array.h b/tensorflow/core/kernels/cuda_device_array.h
index e7a5db0683e..74dc298c7a5 100644
--- a/tensorflow/core/kernels/cuda_device_array.h
+++ b/tensorflow/core/kernels/cuda_device_array.h
@@ -80,7 +80,7 @@ class CudaDeviceArrayOnHost {
     TensorReference tensor_ref(out_of_line_values_on_host_);
     TF_RETURN_IF_ERROR(context_->allocate_temp(
         DT_INT8, TensorShape{total_bytes_}, &out_of_line_values_on_gpu_));
-    perftools::gputools::DeviceMemoryBase output_values_base{
+    se::DeviceMemoryBase output_values_base{
         out_of_line_values_on_gpu_.flat<int8>().data(),
         static_cast<uint64>(total_bytes_)};
     stream->ThenMemcpy(&output_values_base,
diff --git a/tensorflow/core/kernels/cuda_solvers.cc b/tensorflow/core/kernels/cuda_solvers.cc
index 6cec032f949..a857bd3ce4c 100644
--- a/tensorflow/core/kernels/cuda_solvers.cc
+++ b/tensorflow/core/kernels/cuda_solvers.cc
@@ -35,8 +35,6 @@
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/types.h"
 
-using ::perftools::gputools::cuda::ScopedActivateExecutorContext;
-
 // The CUDA cublas_api.h API contains const-correctness errors. Instead of
 // casting away constness on our data, we instead reinterpret the CuBLAS
 // functions as what they were clearly meant to be, and thus we can call
@@ -80,10 +78,12 @@ using matinv_Z = cublasStatus_t(cublasContext*, int, const double2* const*, int,
 namespace tensorflow {
 namespace {
 
+using se::cuda::ScopedActivateExecutorContext;
+
 inline bool CopyHostToDevice(OpKernelContext* context, void* dst,
                              const void* src, uint64 bytes) {
   auto stream = context->op_device_context()->stream();
-  perftools::gputools::DeviceMemoryBase wrapped_dst(dst);
+  se::DeviceMemoryBase wrapped_dst(dst);
   return stream->ThenMemcpy(&wrapped_dst, src, bytes).ok();
 }
 
diff --git a/tensorflow/core/kernels/cuda_solvers.h b/tensorflow/core/kernels/cuda_solvers.h
index ecfa23750c2..b2e8ee23a9c 100644
--- a/tensorflow/core/kernels/cuda_solvers.h
+++ b/tensorflow/core/kernels/cuda_solvers.h
@@ -398,7 +398,7 @@ class DeviceLapackInfo : public ScratchSpace<int> {
     CHECK(success != nullptr);
     HostLapackInfo copy(context(), size(), debug_info());
     auto stream = context()->op_device_context()->stream();
-    perftools::gputools::DeviceMemoryBase wrapped_src(
+    se::DeviceMemoryBase wrapped_src(
         static_cast<void*>(const_cast<int*>(this->data())));
     *success =
         stream->ThenMemcpy(copy.mutable_data(), wrapped_src, this->bytes())
diff --git a/tensorflow/core/kernels/cudnn_pooling_gpu.cc b/tensorflow/core/kernels/cudnn_pooling_gpu.cc
index 5939ecdf62b..d2b9c9edaab 100644
--- a/tensorflow/core/kernels/cudnn_pooling_gpu.cc
+++ b/tensorflow/core/kernels/cudnn_pooling_gpu.cc
@@ -31,12 +31,13 @@ namespace tensorflow {
 #if GOOGLE_CUDA
 
 template <typename T>
-void DnnPooling3dOp<T>::Compute(
-    OpKernelContext* context,
-    perftools::gputools::dnn::PoolingMode pooling_mode,
-    const std::array<int64, 3>& window, const std::array<int64, 3>& stride,
-    const std::array<int64, 3>& padding, TensorFormat data_format,
-    const Tensor& tensor_in, Tensor* output) {
+void DnnPooling3dOp<T>::Compute(OpKernelContext* context,
+                                se::dnn::PoolingMode pooling_mode,
+                                const std::array<int64, 3>& window,
+                                const std::array<int64, 3>& stride,
+                                const std::array<int64, 3>& padding,
+                                TensorFormat data_format,
+                                const Tensor& tensor_in, Tensor* output) {
   const auto in_shape = tensor_in.shape();
   const auto out_shape = output->shape();
 
@@ -67,18 +68,18 @@ void DnnPooling3dOp::Compute(
     transformed_output = *output;
   }
 
-  perftools::gputools::dnn::PoolingDescriptor pooling_desc(3);
+  se::dnn::PoolingDescriptor pooling_desc(3);
   pooling_desc.set_pooling_mode(pooling_mode);
-  perftools::gputools::dnn::BatchDescriptor input_desc(3);
+  se::dnn::BatchDescriptor input_desc(3);
   input_desc.set_count(in_batch)
       .set_feature_map_count(in_features)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::BatchDescriptor output_desc(3);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+  se::dnn::BatchDescriptor output_desc(3);
   output_desc.set_count(in_batch)
       .set_feature_map_count(in_features)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
   for (size_t i = 0; i < window.size(); ++i) {
-    const auto dim_i = static_cast<perftools::gputools::dnn::DimIndex>(i);
+    const auto dim_i = static_cast<se::dnn::DimIndex>(i);
     pooling_desc.set_window(dim_i, window[i]);
     pooling_desc.set_stride(dim_i, stride[i]);
     pooling_desc.set_padding(dim_i, padding[i]);
@@ -115,14 +116,13 @@ void DnnPooling3dOp<T>::Compute(
 
 template <typename T>
 void DnnPooling3dGradOp<T>::Compute(
-    OpKernelContext* context,
-    perftools::gputools::dnn::PoolingMode pooling_mode,
+    OpKernelContext* context, se::dnn::PoolingMode pooling_mode,
     const std::array<int64, 3>& window, const std::array<int64, 3>& stride,
     const std::array<int64, 3>& padding,
     const std::array<int64, 3>& output_size, TensorFormat data_format,
     const Tensor& out_backprop, const TensorShape& tensor_in_shape,
     const Tensor* tensor_in, const Tensor* tensor_out, Tensor* input_backprop) {
-  CHECK((pooling_mode != perftools::gputools::dnn::PoolingMode::kMaximum) ||
+  CHECK((pooling_mode != se::dnn::PoolingMode::kMaximum) ||
         (tensor_in && tensor_out))
       << "For MaxPoolGrad, both tensor_in and tensor_out needs to be "
          "specified";
@@ -186,21 +186,21 @@ void DnnPooling3dGradOp::Compute(
         transformed_output_backprop.tensor<T, 5>());
   }
 
-  perftools::gputools::dnn::PoolingDescriptor pooling_desc(3);
+  se::dnn::PoolingDescriptor pooling_desc(3);
   pooling_desc.set_pooling_mode(pooling_mode);
 
-  perftools::gputools::dnn::BatchDescriptor orig_output_desc(3);
+  se::dnn::BatchDescriptor orig_output_desc(3);
   orig_output_desc.set_count(in_batch)
       .set_feature_map_count(in_features)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
-  perftools::gputools::dnn::BatchDescriptor orig_input_desc(3);
+  se::dnn::BatchDescriptor orig_input_desc(3);
   orig_input_desc.set_count(in_batch)
       .set_feature_map_count(in_features)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
   for (size_t i = 0; i < window.size(); ++i) {
-    const auto dim_i = static_cast<perftools::gputools::dnn::DimIndex>(i);
+    const auto dim_i = static_cast<se::dnn::DimIndex>(i);
     pooling_desc.set_window(dim_i, window[i]);
     pooling_desc.set_stride(dim_i, stride[i]);
     pooling_desc.set_padding(dim_i, padding[i]);
diff --git a/tensorflow/core/kernels/cudnn_pooling_gpu.h b/tensorflow/core/kernels/cudnn_pooling_gpu.h
index ff4de758451..280d697fc2a 100644
--- a/tensorflow/core/kernels/cudnn_pooling_gpu.h
+++ b/tensorflow/core/kernels/cudnn_pooling_gpu.h
@@ -38,7 +38,7 @@ template <typename T>
 class DnnPooling3dOp {
  public:
   static void Compute(OpKernelContext* context,
-                      perftools::gputools::dnn::PoolingMode pooling_mode,
+                      se::dnn::PoolingMode pooling_mode,
                       const std::array<int64, 3>& size,
                       const std::array<int64, 3>& stride,
                       const std::array<int64, 3>& padding,
@@ -52,7 +52,7 @@ template <typename T>
 class DnnPooling3dGradOp {
  public:
   static void Compute(OpKernelContext* context,
-                      perftools::gputools::dnn::PoolingMode pooling_mode,
+                      se::dnn::PoolingMode pooling_mode,
                       const std::array<int64, 3>& window,
                       const std::array<int64, 3>& stride,
                       const std::array<int64, 3>& padding,
diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc
index a21f13a4ddc..762c2c36665 100644
--- a/tensorflow/core/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc
@@ -78,7 +78,7 @@ using CPUDevice = Eigen::ThreadPoolDevice;
 #if GOOGLE_CUDA
 
 using GPUDevice = Eigen::GpuDevice;
-using ::perftools::gputools::StreamExecutor;
+using se::StreamExecutor;
 
 template <typename Device, typename T, typename Index>
 class CudnnRNNParamsSizeOp;
@@ -102,21 +102,21 @@ enum class TFRNNInputMode {
 };
 
 namespace {
-using ::perftools::gputools::DeviceMemory;
-using ::perftools::gputools::DeviceMemoryBase;
-using ::perftools::gputools::ScratchAllocator;
-using ::perftools::gputools::Stream;
-using ::perftools::gputools::dnn::AlgorithmConfig;
-using ::perftools::gputools::dnn::AlgorithmDesc;
-using ::perftools::gputools::dnn::ProfileResult;
-using ::perftools::gputools::dnn::RnnDescriptor;
-using ::perftools::gputools::dnn::RnnDirectionMode;
-using ::perftools::gputools::dnn::RnnInputMode;
-using ::perftools::gputools::dnn::RnnMode;
-using ::perftools::gputools::dnn::RnnSequenceTensorDescriptor;
-using ::perftools::gputools::dnn::RnnStateTensorDescriptor;
-using ::perftools::gputools::dnn::ToDataType;
-using ::perftools::gputools::port::StatusOr;
+using se::DeviceMemory;
+using se::DeviceMemoryBase;
+using se::ScratchAllocator;
+using se::Stream;
+using se::dnn::AlgorithmConfig;
+using se::dnn::AlgorithmDesc;
+using se::dnn::ProfileResult;
+using se::dnn::RnnDescriptor;
+using se::dnn::RnnDirectionMode;
+using se::dnn::RnnInputMode;
+using se::dnn::RnnMode;
+using se::dnn::RnnSequenceTensorDescriptor;
+using se::dnn::RnnStateTensorDescriptor;
+using se::dnn::ToDataType;
+using se::port::StatusOr;
 
 Status ParseRNNMode(const string& str, RnnMode* rnn_mode) {
   if (str == "rnn_relu") {
@@ -213,7 +213,7 @@ DeviceMemoryBase SliceDeviceMemory(const DeviceMemoryBase& device_memory,
   return DeviceMemoryBase(offset_ptr, size);
 }
 
-inline Status FromExecutorStatus(const perftools::gputools::port::Status& s) {
+inline Status FromExecutorStatus(const se::port::Status& s) {
   return s.ok() ? Status::OK()
                 : Status(static_cast<error::Code>(
                              static_cast<int>(s.code())),
@@ -221,17 +221,15 @@ inline Status FromExecutorStatus(const perftools::gputools::port::Status& s) {
 }
 
 template <typename T>
-inline Status FromExecutorStatus(
-    const perftools::gputools::port::StatusOr<T>& s) {
+inline Status FromExecutorStatus(const se::port::StatusOr<T>& s) {
   return FromExecutorStatus(s.status());
 }
 
-inline perftools::gputools::port::Status ToExecutorStatus(const Status& s) {
-  return s.ok() ? perftools::gputools::port::Status::OK()
-                : perftools::gputools::port::Status(
-                      static_cast<perftools::gputools::port::error::Code>(
-                          static_cast<int>(s.code())),
-                      s.error_message());
+inline se::port::Status ToExecutorStatus(const Status& s) {
+  return s.ok() ? se::port::Status::OK()
+                : se::port::Status(static_cast<se::port::error::Code>(
+                                       static_cast<int>(s.code())),
+                                   s.error_message());
 }
 
 template <typename T>
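These two helpers convert between the TensorFlow and StreamExecutor status types by routing the error code through int, which is valid only because the two enums are kept numerically identical. A sketch of the same round trip with two hypothetical enums:

enum class TfCode { kOk = 0, kInvalidArgument = 3 };
enum class SeCode { kOk = 0, kInvalidArgument = 3 };

// Safe precisely because the enumerators share numeric values.
SeCode ToExecutor(TfCode c) {
  return static_cast<SeCode>(static_cast<int>(c));
}
TfCode FromExecutor(SeCode c) {
  return static_cast<TfCode>(static_cast<int>(c));
}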
@@ -503,7 +501,7 @@ Status CreateForwardAndBackwardIODescriptors(
     std::unique_ptr<RnnStateTensorDescriptor>* state_desc,
     std::unique_ptr<RnnSequenceTensorDescriptor>* output_desc) {
   StreamExecutor* executor = context->op_device_context()->stream()->parent();
-  ::perftools::gputools::dnn::DataType data_type = ToDataType<T>::value;
+  se::dnn::DataType data_type = ToDataType<T>::value;
 
   const TensorShape& input_shape = model_shapes.input_shape;
   const TensorShape& hidden_state_shape = model_shapes.hidden_state_shape;
@@ -773,7 +771,7 @@ class CudnnRNNKernelCommon : public OpKernel {
                              ScratchAllocator* dropout_state_allocator,
                              std::unique_ptr<RnnDescriptor>* rnn_desc) {
     StreamExecutor* executor = context->op_device_context()->stream()->parent();
-    ::perftools::gputools::dnn::DataType data_type = ToDataType<T>::value;
+    se::dnn::DataType data_type = ToDataType<T>::value;
     auto rnn_desc_s = executor->createRnnDescriptor(
         model_shapes.num_layers, model_shapes.num_units,
         model_shapes.input_size, input_mode, rnn_direction_mode(), rnn_mode(),
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index 94989089ec9..0abd64030fb 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -1708,8 +1708,7 @@ void LaunchDepthwiseConvBackpropFilterOp<GPUDevice, T>::operator()(
   // Initialize the results to 0.
   int num_filter_backprop =
       args.filter_rows * args.filter_cols * args.out_depth;
-  perftools::gputools::DeviceMemoryBase filter_bp_ptr(filter_backprop,
-                                                      num_filter_backprop);
+  se::DeviceMemoryBase filter_bp_ptr(filter_backprop, num_filter_backprop);
   stream->ThenMemset32(&filter_bp_ptr, 0, num_filter_backprop * sizeof(T));
 
   if (args.filter_rows == 3 && args.filter_cols == 3) {
diff --git a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
index 9dfeccff0e8..862a97723fd 100644
--- a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
@@ -285,8 +285,8 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
         c->allocate_temp(partition_count.dtype(), partition_count.shape(),
                          &cpu_tensor, alloc_attr),
         done);
-    perftools::gputools::DeviceMemoryBase wrapped(
-        partition_count.flat<int32>().data(), num_partitions_ * sizeof(int32));
+    se::DeviceMemoryBase wrapped(partition_count.flat<int32>().data(),
+                                 num_partitions_ * sizeof(int32));
     const bool status =
         stream
             ->ThenMemcpy(cpu_tensor.flat<int32>().data(), wrapped,
diff --git a/tensorflow/core/kernels/fft_ops.cc b/tensorflow/core/kernels/fft_ops.cc
index ab5af8caada..661bf5fc5fb 100644
--- a/tensorflow/core/kernels/fft_ops.cc
+++ b/tensorflow/core/kernels/fft_ops.cc
@@ -277,20 +277,19 @@ REGISTER_KERNEL_BUILDER(Name("IRFFT3D").Device(DEVICE_CPU).Label(FFT_LABEL),
 #undef FFT_LABEL
 
 #if GOOGLE_CUDA
-namespace gpu = ::perftools::gputools;
 
 namespace {
 template <typename T>
-gpu::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
-  gpu::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
-  gpu::DeviceMemory<T> typed(wrapped);
+se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
+  se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
+  se::DeviceMemory<T> typed(wrapped);
   return typed;
 }
 
 template <typename T>
-gpu::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
-  gpu::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), size * sizeof(T));
-  gpu::DeviceMemory<T> typed(wrapped);
+se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
+  se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), size * sizeof(T));
+  se::DeviceMemory<T> typed(wrapped);
   return typed;
 }
 
@@ -299,19 +298,19 @@ gpu::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
 // the kernel finishes.
 // TODO(yangzihao): Refactor redundant code in subclasses of ScratchAllocator
 // into base class.
-class CufftScratchAllocator : public gpu::ScratchAllocator {
+class CufftScratchAllocator : public se::ScratchAllocator {
  public:
   ~CufftScratchAllocator() override {}
   CufftScratchAllocator(int64 memory_limit, OpKernelContext* context)
       : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {}
-  int64 GetMemoryLimitInBytes(gpu::Stream* stream) override {
+  int64 GetMemoryLimitInBytes(se::Stream* stream) override {
     return memory_limit_;
   }
-  gpu::port::StatusOr<gpu::DeviceMemory<uint8>> AllocateBytes(
-      gpu::Stream* stream, int64 byte_size) override {
+  se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
+      se::Stream* stream, int64 byte_size) override {
     Tensor temporary_memory;
     if (byte_size > memory_limit_) {
-      return gpu::port::StatusOr<gpu::DeviceMemory<uint8>>();
+      return se::port::StatusOr<se::DeviceMemory<uint8>>();
     }
     AllocationAttributes allocation_attr;
     allocation_attr.no_retry_on_failure = true;
@@ -319,13 +318,13 @@ class CufftScratchAllocator : public gpu::ScratchAllocator {
         DT_UINT8, TensorShape({byte_size}), &temporary_memory,
         AllocatorAttributes(), allocation_attr));
     if (!allocation_status.ok()) {
-      return gpu::port::StatusOr<gpu::DeviceMemory<uint8>>();
+      return se::port::StatusOr<se::DeviceMemory<uint8>>();
     }
     // Hold the reference of the allocated tensors until the end of the
     // allocator.
     allocated_tensors_.push_back(temporary_memory);
     total_byte_size_ += byte_size;
-    return gpu::port::StatusOr<gpu::DeviceMemory<uint8>>(
+    return se::port::StatusOr<se::DeviceMemory<uint8>>(
         AsDeviceMemory(temporary_memory.flat<uint8>().data(),
                        temporary_memory.flat<uint8>().size()));
   }
@@ -394,9 +393,9 @@ class FFTGPUBase : public FFTBase {
 
     constexpr bool kInPlaceFft = false;
     const auto kFftType =
-        IsReal() ? (IsForward() ? gpu::fft::Type::kR2C : gpu::fft::Type::kC2R)
-                 : (IsForward() ? gpu::fft::Type::kC2CForward
-                                : gpu::fft::Type::kC2CInverse);
+        IsReal() ? (IsForward() ? se::fft::Type::kR2C : se::fft::Type::kC2R)
+                 : (IsForward() ? se::fft::Type::kC2CForward
+                                : se::fft::Type::kC2CInverse);
 
     CufftScratchAllocator scratch_allocator(CufftScratchSize, ctx);
     auto plan =
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc
index 9b4dca85113..f99dd643f76 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cc
@@ -251,7 +251,7 @@ struct FusedBatchNorm {
     Tensor x_maybe_transformed = x;
     Tensor x_transformed;
     Tensor y_transformed;
-    perftools::gputools::DeviceMemory<T> y_ptr;
+    se::DeviceMemory<T> y_ptr;
 
     if (tensor_format == FORMAT_NCHW) {
       y_ptr = StreamExecutorUtil::AsDeviceMemory<T>(*y);
@@ -279,19 +279,19 @@ struct FusedBatchNorm {
       return;
     }
 
-    perftools::gputools::dnn::BatchDescriptor x_desc;
+    se::dnn::BatchDescriptor x_desc;
     x_desc.set_count(batch_size)
         .set_feature_map_count(channels)
         .set_height(height)
         .set_width(width)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
-    perftools::gputools::dnn::BatchDescriptor scale_offset_desc;
+    se::dnn::BatchDescriptor scale_offset_desc;
     scale_offset_desc.set_count(1)
         .set_feature_map_count(channels)
         .set_height(1)
         .set_width(1)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
     auto x_ptr = StreamExecutorUtil::AsDeviceMemory<T>(x_maybe_transformed);
     auto scale_ptr = StreamExecutorUtil::AsDeviceMemory<U>(scale);
@@ -308,7 +308,7 @@ struct FusedBatchNorm {
         StreamExecutorUtil::AsDeviceMemory<U>(*saved_inv_var);
 
     GPUDevice d = context->eigen_device<GPUDevice>();
-    using perftools::gputools::DeviceMemory;
+    using se::DeviceMemory;
     Tensor inv_var;
     OP_REQUIRES_OK(
         context, context->allocate_temp(DataTypeToEnum<U>::value,
@@ -390,7 +390,7 @@ struct FusedBatchNormGrad {
 
     // Outputs
     Tensor x_backprop_transformed;
-    perftools::gputools::DeviceMemory<T> x_backprop_ptr;
+    se::DeviceMemory<T> x_backprop_ptr;
 
     if (tensor_format == FORMAT_NCHW) {
       x_backprop_ptr = StreamExecutorUtil::AsDeviceMemory<T>(*x_backprop);
@@ -433,19 +433,19 @@ struct FusedBatchNormGrad {
       return;
     }
 
-    perftools::gputools::dnn::BatchDescriptor x_desc;
+    se::dnn::BatchDescriptor x_desc;
     x_desc.set_count(batch_size)
         .set_feature_map_count(channels)
         .set_height(height)
         .set_width(width)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
-    perftools::gputools::dnn::BatchDescriptor scale_offset_desc;
+    se::dnn::BatchDescriptor scale_offset_desc;
     scale_offset_desc.set_count(1)
         .set_feature_map_count(channels)
         .set_height(1)
         .set_width(1)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
     auto y_backprop_ptr =
         StreamExecutorUtil::AsDeviceMemory<T>(y_backprop_maybe_transformed);
diff --git a/tensorflow/core/kernels/gpu_utils.h b/tensorflow/core/kernels/gpu_utils.h
index ffc733e6bb6..2f64619afc1 100644
--- a/tensorflow/core/kernels/gpu_utils.h
+++ b/tensorflow/core/kernels/gpu_utils.h
@@ -29,11 +29,9 @@ limitations under the License.
 namespace tensorflow {
 
 template <typename T>
-inline perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory,
-                                                           uint64 size) {
-  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory),
-                                                size * sizeof(T));
-  perftools::gputools::DeviceMemory<T> typed(wrapped);
+inline se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
+  se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), size * sizeof(T));
+  se::DeviceMemory<T> typed(wrapped);
   return typed;
 }
 
diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc
index c3a59c95762..b4252eb0444 100644
--- a/tensorflow/core/kernels/lrn_op.cc
+++ b/tensorflow/core/kernels/lrn_op.cc
@@ -187,14 +187,14 @@ struct LaunchLRN {
     const int cols = static_cast<int>(in.dim_size(2));
     const int depth = static_cast<int>(in.dim_size(3));
 
-    perftools::gputools::dnn::BatchDescriptor dimensions_desc;
+    se::dnn::BatchDescriptor dimensions_desc;
     dimensions_desc.set_count(batch)
         .set_height(rows)
         .set_width(cols)
         .set_feature_map_count(depth)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth);
+        .set_layout(se::dnn::DataLayout::kBatchYXDepth);
 
-    perftools::gputools::dnn::NormalizeDescriptor normalize_desc;
+    se::dnn::NormalizeDescriptor normalize_desc;
     normalize_desc.set_bias(bias_)
         .set_range(depth_radius_)
         .set_alpha(alpha_)
@@ -404,14 +404,14 @@ struct LaunchLRNGrad {
     const int64 cols = in_grads.dim_size(2);
     const int64 depth = in_grads.dim_size(3);
 
-    perftools::gputools::dnn::BatchDescriptor dimensions_desc;
+    se::dnn::BatchDescriptor dimensions_desc;
     dimensions_desc.set_count(batch)
         .set_height(rows)
         .set_width(cols)
         .set_feature_map_count(depth)
-        .set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth);
+        .set_layout(se::dnn::DataLayout::kBatchYXDepth);
 
-    perftools::gputools::dnn::NormalizeDescriptor normalize_desc;
+    se::dnn::NormalizeDescriptor normalize_desc;
     normalize_desc.set_bias(bias_)
         .set_range(depth_radius_)
         .set_alpha(alpha_)
diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc
index f499ce6519d..3664f95c3b1 100644
--- a/tensorflow/core/kernels/matmul_op.cc
+++ b/tensorflow/core/kernels/matmul_op.cc
@@ -112,7 +112,7 @@ bool ExplicitVectorMatrixOptimization(
 template 
 struct LaunchMatMulBase {
 #if GOOGLE_CUDA
-  typedef perftools::gputools::blas::AlgorithmType AlgorithmType;
+  typedef se::blas::AlgorithmType AlgorithmType;
 #else
   typedef int64 AlgorithmType;
 #endif  // GOOGLE_CUDA
@@ -160,15 +160,12 @@ namespace {
 
 template <typename T>
 struct LaunchBlasGemv {
-  static void Compute(
-      OpKernelContext* ctx, perftools::gputools::Stream* stream, bool trans,
-      uint64 m, uint64 n, const perftools::gputools::DeviceMemory<T>& a,
-      const perftools::gputools::DeviceMemory<T>& b,
-      perftools::gputools::DeviceMemory<T>* c,
-      perftools::gputools::blas::ProfileResult* output_profile) {
-    const auto blas_trans =
-        trans ? perftools::gputools::blas::Transpose::kTranspose
-              : perftools::gputools::blas::Transpose::kNoTranspose;
+  static void Compute(OpKernelContext* ctx, se::Stream* stream, bool trans,
+                      uint64 m, uint64 n, const se::DeviceMemory<T>& a,
+                      const se::DeviceMemory<T>& b, se::DeviceMemory<T>* c,
+                      se::blas::ProfileResult* output_profile) {
+    const auto blas_trans = trans ? se::blas::Transpose::kTranspose
+                                  : se::blas::Transpose::kNoTranspose;
     if (output_profile == nullptr) {
       bool blas_launch_status =
           stream
@@ -198,11 +195,10 @@ struct LaunchBlasGemv {
 
 template <>
 void LaunchBlasGemv<Eigen::half>::Compute(
-    OpKernelContext* ctx, perftools::gputools::Stream* stream, bool trans,
-    uint64 m, uint64 n, const perftools::gputools::DeviceMemory<Eigen::half>& a,
-    const perftools::gputools::DeviceMemory<Eigen::half>& b,
-    perftools::gputools::DeviceMemory<Eigen::half>* c,
-    perftools::gputools::blas::ProfileResult* output_profile) {
+    OpKernelContext* ctx, se::Stream* stream, bool trans, uint64 m, uint64 n,
+    const se::DeviceMemory<Eigen::half>& a,
+    const se::DeviceMemory<Eigen::half>& b, se::DeviceMemory<Eigen::half>* c,
+    se::blas::ProfileResult* output_profile) {
   ctx->SetStatus(errors::Internal(
       "Blas GEMV launch failed: GEMV is not implemented for float16."));
 }
@@ -219,10 +215,9 @@ bool ShouldUseGemv(uint64 n) {
 
 }  // namespace
 
-bool GetCublasAutotuneComputationType(
-    const DataType& dtype,
-    perftools::gputools::blas::ComputationType* compute_type) {
-  using perftools::gputools::blas::ComputationType;
+bool GetCublasAutotuneComputationType(const DataType& dtype,
+                                      se::blas::ComputationType* compute_type) {
+  using se::blas::ComputationType;
   bool use_f32_for_f16_computation = MatmulDoFP32ComputationFP16Input();
   switch (dtype) {
     case DT_HALF:
@@ -250,7 +245,7 @@ struct MatmulAutoTuneGroup {
   static string name() { return "Matmul"; }
 };
 typedef AutoTuneSingleton<MatmulAutoTuneGroup, MatmulParameters,
-                          perftools::gputools::blas::AlgorithmConfig>
+                          se::blas::AlgorithmConfig>
     AutoTuneMatmul;
 
 template 
@@ -259,14 +254,14 @@ struct LaunchMatMul {
       OpKernelContext* ctx, const Tensor& a, const Tensor& b,
       const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
       std::vector<int64>* algorithms, bool use_autotune, Tensor* out) {
-    using perftools::gputools::blas::AlgorithmConfig;
-    using perftools::gputools::blas::ComputationType;
-    using perftools::gputools::blas::kDefaultAlgorithm;
-    using perftools::gputools::blas::kDefaultBlasGemm;
-    using perftools::gputools::blas::kDefaultBlasGemv;
-    using perftools::gputools::blas::kNoAlgorithm;
-    using perftools::gputools::blas::ProfileResult;
-    using perftools::gputools::blas::Transpose;
+    using se::blas::AlgorithmConfig;
+    using se::blas::ComputationType;
+    using se::blas::kDefaultAlgorithm;
+    using se::blas::kDefaultBlasGemm;
+    using se::blas::kDefaultBlasGemv;
+    using se::blas::kNoAlgorithm;
+    using se::blas::ProfileResult;
+    using se::blas::Transpose;
     Transpose trans[] = {Transpose::kNoTranspose, Transpose::kTranspose};
     const uint64 m = a.dim_size(1 - dim_pair[0].first);
     const uint64 k = a.dim_size(dim_pair[0].first);
diff --git a/tensorflow/core/kernels/matrix_triangular_solve_op.cc b/tensorflow/core/kernels/matrix_triangular_solve_op.cc
index 6f7e6a74968..5de0d1118af 100644
--- a/tensorflow/core/kernels/matrix_triangular_solve_op.cc
+++ b/tensorflow/core/kernels/matrix_triangular_solve_op.cc
@@ -34,11 +34,9 @@ namespace tensorflow {
 #if GOOGLE_CUDA
 namespace {
 template <typename Scalar>
-perftools::gputools::DeviceMemory<Scalar> AsDeviceMemory(
-    const Scalar* cuda_memory) {
-  perftools::gputools::DeviceMemoryBase wrapped(
-      const_cast<Scalar*>(cuda_memory));
-  perftools::gputools::DeviceMemory<Scalar> typed(wrapped);
+se::DeviceMemory<Scalar> AsDeviceMemory(const Scalar* cuda_memory) {
+  se::DeviceMemoryBase wrapped(const_cast<Scalar*>(cuda_memory));
+  se::DeviceMemory<Scalar> typed(wrapped);
   return typed;
 }
 }  // namespace
@@ -204,18 +202,17 @@ class MatrixTriangularSolveOpGPU : public LinearAlgebraOp<Scalar> {
     // output' = rhs' / matrix' (' stands for transpose)
     // Upper/lower needs to be swapped for this.
 
-    perftools::gputools::blas::UpperLower upper_lower_matrix;
-    perftools::gputools::blas::Transpose transpose_matrix;
+    se::blas::UpperLower upper_lower_matrix;
+    se::blas::Transpose transpose_matrix;
     if (lower_) {
-      upper_lower_matrix = perftools::gputools::blas::UpperLower::kUpper;
+      upper_lower_matrix = se::blas::UpperLower::kUpper;
     } else {
-      upper_lower_matrix = perftools::gputools::blas::UpperLower::kLower;
+      upper_lower_matrix = se::blas::UpperLower::kLower;
     }
     if (adjoint_) {
-      transpose_matrix =
-          perftools::gputools::blas::Transpose::kConjugateTranspose;
+      transpose_matrix = se::blas::Transpose::kConjugateTranspose;
     } else {
-      transpose_matrix = perftools::gputools::blas::Transpose::kNoTranspose;
+      transpose_matrix = se::blas::Transpose::kNoTranspose;
     }
     uint64 leading_dim_matrix = matrix.cols();
     uint64 leading_dim_output = output.cols();
@@ -224,11 +221,11 @@ class MatrixTriangularSolveOpGPU : public LinearAlgebraOp<Scalar> {
     bool blas_launch_status =
         stream
             ->ThenBlasTrsm(
-                perftools::gputools::blas::Side::kRight /*side*/,
-                upper_lower_matrix /*uplo*/, transpose_matrix /*trans*/,
-                perftools::gputools::blas::Diagonal::kNonUnit /*diag*/,
-                colmajor_rows /*m*/, colmajor_cols /*n*/, Scalar(1.0) /*alpha*/,
-                matrix_ptr, leading_dim_matrix /*lda*/, &out_ptr,
+                se::blas::Side::kRight /*side*/, upper_lower_matrix /*uplo*/,
+                transpose_matrix /*trans*/,
+                se::blas::Diagonal::kNonUnit /*diag*/, colmajor_rows /*m*/,
+                colmajor_cols /*n*/, Scalar(1.0) /*alpha*/, matrix_ptr,
+                leading_dim_matrix /*lda*/, &out_ptr,
                 leading_dim_output /*ldb*/)
             .ok();
     if (!blas_launch_status) {
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index aaaf45d3e78..507fc998377 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -404,10 +404,10 @@ class MaxPoolingGradOp : public OpKernel {
                     "Pooling is not yet supported on the batch dimension."));
 
     if (use_dnn_) {
-      DnnPoolingGradOp<T>::Compute(
-          context, perftools::gputools::dnn::PoolingMode::kMaximum, ksize,
-          stride, padding_, data_format_, &tensor_in, &tensor_out, out_backprop,
-          output_shape, propagate_nans_);
+      DnnPoolingGradOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum,
+                                   ksize, stride, padding_, data_format_,
+                                   &tensor_in, &tensor_out, out_backprop,
+                                   output_shape, propagate_nans_);
     } else {
       CHECK(data_format_ == FORMAT_NHWC)
           << "Non-Cudnn MaxPoolGrad only supports NHWC format";
@@ -1136,10 +1136,9 @@ class MaxPoolingNoMaskOp : public OpKernel {
 
     // These is_int8x4 checks avoid linker errors for missing qint8 kernels.
     if (!is_int8x4 && use_dnn_ && data_format_ == FORMAT_NCHW) {
-      DnnPoolingOp<T>::Compute(context,
-                               perftools::gputools::dnn::PoolingMode::kMaximum,
-                               ksize_, stride_, padding_, data_format_,
-                               tensor_in, out_shape, propagate_nans_);
+      DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize_,
+                               stride_, padding_, data_format_, tensor_in,
+                               out_shape, propagate_nans_);
     } else {
       Tensor* output = nullptr;
       OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
@@ -1240,9 +1239,8 @@ class MaxPoolingNoMaskV2Op : public OpKernel {
         ShapeFromFormat(data_format_, params.tensor_in_batch, params.out_height,
                         params.out_width, params.depth);
     if (use_dnn_ && data_format_ == FORMAT_NCHW) {
-      DnnPoolingOp<T>::Compute(context,
-                               perftools::gputools::dnn::PoolingMode::kMaximum,
-                               ksize, stride, padding_, data_format_, tensor_in,
+      DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize,
+                               stride, padding_, data_format_, tensor_in,
                                out_shape, propagate_nans_);
     } else {
       CHECK(data_format_ == FORMAT_NHWC)
diff --git a/tensorflow/core/kernels/pooling_ops_3d.cc b/tensorflow/core/kernels/pooling_ops_3d.cc
index 01bcfede1e8..2180c4eb977 100644
--- a/tensorflow/core/kernels/pooling_ops_3d.cc
+++ b/tensorflow/core/kernels/pooling_ops_3d.cc
@@ -748,9 +748,8 @@ struct LaunchPoolingOp {
                      const std::array<int64, 3>& padding,
                      TensorFormat data_format, Padding padding_type,
                      Tensor* output) {
-    DnnPooling3dOp<T>::Compute(
-        context, perftools::gputools::dnn::PoolingMode::kAverage, window,
-        stride, padding, data_format, tensor_in, output);
+    DnnPooling3dOp<T>::Compute(context, se::dnn::PoolingMode::kAverage, window,
+                               stride, padding, data_format, tensor_in, output);
   }
 };
 
@@ -762,9 +761,8 @@ struct LaunchPoolingOp {
                      const std::array<int64, 3>& padding,
                      TensorFormat data_format, Padding padding_type,
                      Tensor* output) {
-    DnnPooling3dOp<T>::Compute(
-        context, perftools::gputools::dnn::PoolingMode::kMaximum, window,
-        stride, padding, data_format, tensor_in, output);
+    DnnPooling3dOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, window,
+                               stride, padding, data_format, tensor_in, output);
   }
 };
 
@@ -778,10 +776,10 @@ struct LaunchMaxPooling3dGradOp {
                      const std::array<int64, 3>& padding,
                      TensorFormat data_format, Tensor* input_backprop) {
     const TensorShape output_shape = tensor_in.shape();
-    DnnPooling3dGradOp<T>::Compute(
-        context, perftools::gputools::dnn::PoolingMode::kMaximum, window,
-        stride, padding, out, data_format, out_backprop, output_shape,
-        &tensor_in, &tensor_out, input_backprop);
+    DnnPooling3dGradOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum,
+                                   window, stride, padding, out, data_format,
+                                   out_backprop, output_shape, &tensor_in,
+                                   &tensor_out, input_backprop);
   }
 };
 
@@ -796,9 +794,8 @@ struct LaunchAvgPooling3dGradOp {
                      const std::array<int64, 3>& padding,
                      TensorFormat data_format, Tensor* output) {
     DnnPooling3dGradOp<T>::Compute(
-        context, perftools::gputools::dnn::PoolingMode::kAverage, window,
-        stride, padding, out, data_format, out_backprop, tensor_in_shape,
-        nullptr, nullptr, output);
+        context, se::dnn::PoolingMode::kAverage, window, stride, padding, out,
+        data_format, out_backprop, tensor_in_shape, nullptr, nullptr, output);
   }
 };
 
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
index d4241b58090..e583f7feb4d 100644
--- a/tensorflow/core/kernels/pooling_ops_common.cc
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -114,11 +114,9 @@ TensorShape PoolParameters::forward_output_shape() {
 
 namespace {
 template <typename T>
-perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory,
-                                                    uint64 size) {
-  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory),
-                                                size * sizeof(T));
-  perftools::gputools::DeviceMemory<T> typed(wrapped);
+se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
+  se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), size * sizeof(T));
+  se::DeviceMemory<T> typed(wrapped);
   return typed;
 }
 }  // namespace
@@ -138,12 +136,13 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC)
 }  // namespace functor
 
 template <typename T>
-void DnnPoolingOp<T>::Compute(
-    OpKernelContext* context,
-    perftools::gputools::dnn::PoolingMode pooling_mode,
-    const std::vector<int32>& size, const std::vector<int32>& stride,
-    Padding padding, TensorFormat data_format, const Tensor& tensor_in,
-    const TensorShape& tensor_out_shape, bool propagate_nans) {
+void DnnPoolingOp<T>::Compute(OpKernelContext* context,
+                              se::dnn::PoolingMode pooling_mode,
+                              const std::vector<int32>& size,
+                              const std::vector<int32>& stride, Padding padding,
+                              TensorFormat data_format, const Tensor& tensor_in,
+                              const TensorShape& tensor_out_shape,
+                              bool propagate_nans) {
   Tensor* tensor_out = nullptr;
   OP_REQUIRES_OK(context,
                  context->allocate_output(0, tensor_out_shape, &tensor_out));
@@ -184,7 +183,7 @@ void DnnPoolingOp<T>::Compute(
   }
 
   /// Get ready to call cudnn
-  perftools::gputools::dnn::PoolingDescriptor pooling_desc;
+  se::dnn::PoolingDescriptor pooling_desc;
   pooling_desc.set_pooling_mode(pooling_mode)
       .set_window_height(params.window_rows)
       .set_window_width(params.window_cols)
@@ -194,19 +193,19 @@ void DnnPoolingOp<T>::Compute(
       .set_horizontal_padding(params.pad_cols)
       .set_propagate_nans(propagate_nans);
 
-  perftools::gputools::dnn::BatchDescriptor input_desc;
+  se::dnn::BatchDescriptor input_desc;
   input_desc.set_count(params.tensor_in_batch)
       .set_height(params.tensor_in_rows)
       .set_width(params.tensor_in_cols)
       .set_feature_map_count(params.depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
-  perftools::gputools::dnn::BatchDescriptor output_desc;
+  se::dnn::BatchDescriptor output_desc;
   output_desc.set_count(params.tensor_in_batch)
       .set_height(params.out_height)
       .set_width(params.out_width)
       .set_feature_map_count(params.depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
   auto input_data = AsDeviceMemory(transformed_input.template flat<T>().data(),
                                    transformed_input.template flat<T>().size());
@@ -236,13 +235,12 @@ void DnnPoolingOp<T>::Compute(
 
 template <typename T>
 void DnnPoolingGradOp<T>::Compute(
-    OpKernelContext* context,
-    perftools::gputools::dnn::PoolingMode pooling_mode,
+    OpKernelContext* context, se::dnn::PoolingMode pooling_mode,
     const std::vector<int32>& size, const std::vector<int32>& stride,
     Padding padding, TensorFormat data_format, const Tensor* tensor_in,
     const Tensor* tensor_out, const Tensor& out_backprop,
     const TensorShape& tensor_in_shape, bool propagate_nans) {
-  CHECK((pooling_mode != perftools::gputools::dnn::PoolingMode::kMaximum) ||
+  CHECK((pooling_mode != se::dnn::PoolingMode::kMaximum) ||
         (tensor_in && tensor_out))
       << "For MaxPoolGrad, both tensor_in and tensor_out needs to be "
          "specified";
@@ -327,7 +325,7 @@ void DnnPoolingGradOp<T>::Compute(
   }
 
   /// Get ready to call cudnn
-  perftools::gputools::dnn::PoolingDescriptor pooling_desc;
+  se::dnn::PoolingDescriptor pooling_desc;
   pooling_desc.set_pooling_mode(pooling_mode)
       .set_window_height(params.window_rows)
       .set_window_width(params.window_cols)
@@ -337,19 +335,19 @@ void DnnPoolingGradOp<T>::Compute(
       .set_horizontal_padding(params.pad_cols)
       .set_propagate_nans(propagate_nans);
 
-  perftools::gputools::dnn::BatchDescriptor orig_output_desc;
+  se::dnn::BatchDescriptor orig_output_desc;
   orig_output_desc.set_count(params.tensor_in_batch)
       .set_height(params.out_height)
       .set_width(params.out_width)
       .set_feature_map_count(params.depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
-  perftools::gputools::dnn::BatchDescriptor orig_input_desc;
+  se::dnn::BatchDescriptor orig_input_desc;
   orig_input_desc.set_count(params.tensor_in_batch)
       .set_height(params.tensor_in_rows)
       .set_width(params.tensor_in_cols)
       .set_feature_map_count(params.depth)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
 
   auto orig_output_data =
       AsDeviceMemory(transformed_output.template flat<T>().data(),
diff --git a/tensorflow/core/kernels/pooling_ops_common_gpu.h b/tensorflow/core/kernels/pooling_ops_common_gpu.h
index 14584565857..7362c5275f7 100644
--- a/tensorflow/core/kernels/pooling_ops_common_gpu.h
+++ b/tensorflow/core/kernels/pooling_ops_common_gpu.h
@@ -40,7 +40,7 @@ class DnnPoolingOp {
  public:
   typedef GPUDevice Device;
   static void Compute(OpKernelContext* context,
-                      perftools::gputools::dnn::PoolingMode pooling_mode,
+                      se::dnn::PoolingMode pooling_mode,
                       const std::vector<int32>& size,
                       const std::vector<int32>& stride, Padding padding,
                       TensorFormat data_format, const Tensor& tensor_in,
@@ -55,7 +55,7 @@ class DnnPoolingGradOp {
  public:
   typedef GPUDevice Device;
   static void Compute(OpKernelContext* context,
-                      perftools::gputools::dnn::PoolingMode pooling_mode,
+                      se::dnn::PoolingMode pooling_mode,
                       const std::vector<int32>& size,
                       const std::vector<int32>& stride, Padding padding,
                       TensorFormat data_format, const Tensor* tensor_in,
diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc
index 2fc73a3309d..c87ce78e051 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops.cc
@@ -40,7 +40,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/cuda_solvers.h"
 #include "tensorflow/core/platform/cuda.h"
 
-using ::perftools::gputools::cuda::ScopedActivateExecutorContext;
+using stream_executor::cuda::ScopedActivateExecutorContext;
 #endif  // GOOGLE_CUDA
 
 namespace tensorflow {
@@ -242,7 +242,7 @@ class SegmentSumGPUOp : public AsyncOpKernel {
       return;
     }
 
-    perftools::gputools::DeviceMemoryBase output_rows_device(
+    se::DeviceMemoryBase output_rows_device(
         const_cast<Tensor&>(segment_ids).template flat<Index>().data() +
         (num_indices - 1));
     ScratchSpace output_rows_host(context, 1, /* on_host */ true);
diff --git a/tensorflow/core/kernels/where_op.cc b/tensorflow/core/kernels/where_op.cc
index f92c4ed17af..3330442ffd6 100644
--- a/tensorflow/core/kernels/where_op.cc
+++ b/tensorflow/core/kernels/where_op.cc
@@ -42,7 +42,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/cuda_solvers.h"
 #include "tensorflow/core/platform/cuda.h"
 
-using ::perftools::gputools::cuda::ScopedActivateExecutorContext;
+using stream_executor::cuda::ScopedActivateExecutorContext;
 #endif  // GOOGLE_CUDA
 
 namespace tensorflow {
@@ -278,8 +278,7 @@ class WhereGPUOp : public AsyncOpKernel {
 
     auto num_true_t = num_true.scalar();
 
-    perftools::gputools::DeviceMemoryBase num_true_ptr(
-        static_cast<void*>(num_true_t.data()));
+    se::DeviceMemoryBase num_true_ptr(static_cast<void*>(num_true_t.data()));
     // Push kernel to stream to get number of true elements.
     const GPUDevice& d = context->eigen_device<GPUDevice>();
     Status s = functor::NumTrue::Compute(
diff --git a/tensorflow/core/platform/default/gpu/cupti_wrapper.cc b/tensorflow/core/platform/default/gpu/cupti_wrapper.cc
index 580db4844f2..7ac5e5c4450 100644
--- a/tensorflow/core/platform/default/gpu/cupti_wrapper.cc
+++ b/tensorflow/core/platform/default/gpu/cupti_wrapper.cc
@@ -28,27 +28,27 @@ namespace profiler {
 
 namespace dynload {
 
-#define LIBCUPTI_WRAP(__name)                                               \
-  struct DynLoadShim__##__name {                                            \
-    static const char* kName;                                               \
-    using FuncPointerT = std::add_pointer<decltype(::__name)>::type;       \
-    static void* GetDsoHandle() {                                           \
-      static auto status = perftools::gputools::internal::CachedDsoLoader:: \
-          GetLibcuptiDsoHandle();                                           \
-      return status.ValueOrDie();                                           \
-    }                                                                       \
-    static FuncPointerT DynLoad() {                                         \
-      static void* f;                                                       \
-      TF_CHECK_OK(::tensorflow::Env::Default()->GetSymbolFromLibrary(       \
-          GetDsoHandle(), kName, &f))                                       \
-          << "could not find " << kName << "in libcupti DSO";               \
-      return reinterpret_cast<FuncPointerT>(f);                            \
-    }                                                                       \
-    template <typename... Args>                                            \
-    CUptiResult operator()(Args... args) {                                  \
-      return DynLoad()(args...);                                            \
-    }                                                                       \
-  } __name;                                                                 \
+#define LIBCUPTI_WRAP(__name)                                                 \
+  struct DynLoadShim__##__name {                                              \
+    static const char* kName;                                                 \
+    using FuncPointerT = std::add_pointer<decltype(::__name)>::type;         \
+    static void* GetDsoHandle() {                                             \
+      static auto status =                                                    \
+          stream_executor::internal::CachedDsoLoader::GetLibcuptiDsoHandle(); \
+      return status.ValueOrDie();                                             \
+    }                                                                         \
+    static FuncPointerT DynLoad() {                                           \
+      static void* f;                                                         \
+      TF_CHECK_OK(::tensorflow::Env::Default()->GetSymbolFromLibrary(         \
+          GetDsoHandle(), kName, &f))                                         \
+          << "could not find " << kName << "in libcupti DSO";                 \
+      return reinterpret_cast<FuncPointerT>(f);                              \
+    }                                                                         \
+    template <typename... Args>                                              \
+    CUptiResult operator()(Args... args) {                                    \
+      return DynLoad()(args...);                                              \
+    }                                                                         \
+  } __name;                                                                   \
   const char* DynLoadShim__##__name::kName = #__name;
 
 LIBCUPTI_WRAP(cuptiActivityDisable);
diff --git a/tensorflow/core/platform/types.h b/tensorflow/core/platform/types.h
index f2471712cca..68897ac423f 100644
--- a/tensorflow/core/platform/types.h
+++ b/tensorflow/core/platform/types.h
@@ -63,9 +63,7 @@ typedef uint64 Fprint;
 // Alias namespace ::stream_executor as ::tensorflow::se.
 namespace stream_executor {}
 namespace tensorflow {
-// TODO(b/77980417): Uncomment this once all namespace aliases named 'se' are
-// removed in ::xla.
-// namespace se = ::stream_executor;
+namespace se = ::stream_executor;
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_PLATFORM_TYPES_H_
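
The entire mechanical rename above rests on the one-line types.h change in the
last hunk: with the alias uncommented, every file inside namespace tensorflow
can spell ::stream_executor types with the short se:: prefix. A minimal,
self-contained sketch of the pattern (the DataType enum is a stand-in for the
real StreamExecutor headers, not TF code):

    #include <type_traits>

    // Stand-in declarations; the real ones live in stream_executor's headers.
    namespace stream_executor {
    namespace dnn {
    enum class DataType { kFloat, kHalf, kDouble };
    }  // namespace dnn
    }  // namespace stream_executor

    namespace tensorflow {
    namespace se = ::stream_executor;  // the alias this patch enables

    // Inside namespace tensorflow the short and the fully qualified spelling
    // name the same type, which is why the textual rewrite in the hunks above
    // is behavior-preserving.
    static_assert(std::is_same<se::dnn::DataType,
                               ::stream_executor::dnn::DataType>::value,
                  "alias and fully qualified name agree");
    }  // namespace tensorflow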

From 7bee86727b87a8317d4f1407061edfa9ccb16ea5 Mon Sep 17 00:00:00 2001
From: Igor Ganichev 
Date: Mon, 23 Apr 2018 19:35:12 -0700
Subject: [PATCH 0642/1734] Don't Ref() XlaDeviceContext unnecessarily.

PiperOrigin-RevId: 194024407
---
 tensorflow/compiler/jit/xla_device.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index 7beb18c04d6..3e27cd39c62 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -234,7 +234,6 @@ Status XlaDevice::CreateAndSetGpuDeviceInfo() {
     gpu_device_info_->stream = stream;
     gpu_device_info_->default_context =
         new XlaDeviceContext(stream, client(), transfer_as_literal_);
-    gpu_device_info_->default_context->Ref();
     set_tensorflow_gpu_device_info(gpu_device_info_.get());
   }
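
Context for the deleted line: XlaDeviceContext derives (via DeviceContext)
from core::RefCounted, whose objects start life with a reference count of one
that already belongs to whoever called new. The extra Ref() therefore pushed
the count to two with only one Unref() ever coming, keeping the context alive
for the lifetime of the process. A minimal sketch of the convention -- not
TF's actual class, just the starts-at-one invariant the fix relies on:

    class RefCountedSketch {
     public:
      RefCountedSketch() : ref_(1) {}  // the creator already owns a reference
      void Ref() { ++ref_; }
      void Unref() {
        if (--ref_ == 0) delete this;  // last owner destroys the object
      }

     protected:
      virtual ~RefCountedSketch() {}

     private:
      int ref_;
    };

    int main() {
      auto* ctx = new RefCountedSketch;  // ref_ == 1
      // ctx->Ref();                     // the removed call: count becomes 2
      ctx->Unref();                      // count reaches 0 and ctx is freed;
                                         // with the extra Ref() it would leak
      return 0;
    }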
 

From 3f7c9265b59cae306d029dfac76e25badd20def8 Mon Sep 17 00:00:00 2001
From: Sung Jin Hwang 
Date: Mon, 23 Apr 2018 19:35:19 -0700
Subject: [PATCH 0643/1734] Add missing pmf_to_cdf_op.cc to the source list in
 cmake.

Also split range_coder_ops.cc and range_coder_ops_util.cc into separate targets
so that depending on range_coder_ops_util.cc does not register the kernels again.

PiperOrigin-RevId: 194024410
---
 tensorflow/contrib/coder/BUILD | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/coder/BUILD b/tensorflow/contrib/coder/BUILD
index a146460a9cd..a2c6e413039 100644
--- a/tensorflow/contrib/coder/BUILD
+++ b/tensorflow/contrib/coder/BUILD
@@ -54,19 +54,27 @@ tf_gen_op_libs(
     ],
 )
 
+cc_library(
+    name = "range_coder_ops_util",
+    srcs = ["kernels/range_coder_ops_util.cc"],
+    hdrs = ["kernels/range_coder_ops_util.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
 tf_kernel_library(
     name = "range_coder_ops",
     srcs = [
         "kernels/range_coder_ops.cc",
-        "kernels/range_coder_ops_util.cc",
-    ],
-    hdrs = [
-        "kernels/range_coder_ops_util.h",
     ],
     visibility = ["//visibility:public"],
     deps = [
         ":coder_ops_op_lib",
         ":range_coder",
+        ":range_coder_ops_util",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
     ],
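
Why the split matters: REGISTER_KERNEL_BUILDER expands to a file-scope static
registrar whose constructor adds the kernel to a process-wide registry during
static initialization, once per linked copy of the object file. While
range_coder_ops_util.cc was compiled into the tf_kernel_library target, any
target that only wanted the utility functions also linked the kernel sources
and re-ran their registrations. A rough sketch of the mechanism (the registrar
class and the "RangeEncode" op name are illustrative, not TF's exact macro
expansion):

    #include <iostream>

    // Stand-in for the static object REGISTER_KERNEL_BUILDER leaves behind.
    struct KernelRegistrarSketch {
      explicit KernelRegistrarSketch(const char* op_name) {
        // TF inserts into a global kernel registry here; a second insertion
        // for the same (op, device type, label) key is reported as an error.
        std::cout << "registering kernel for " << op_name << "\n";
      }
    };

    // Runs during static initialization in every binary that links this
    // translation unit, which is why bundling unrelated utility code into a
    // kernel target re-registers the kernels wherever the utility is reused.
    static KernelRegistrarSketch registrar("RangeEncode");

    int main() { return 0; }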

From 24b7c9a800ab5086d45a7d83ebcd6218424dc9e3 Mon Sep 17 00:00:00 2001
From: Yuefeng Zhou 
Date: Mon, 23 Apr 2018 20:15:30 -0700
Subject: [PATCH 0644/1734] Make the all_reduce._split_by_task function able to
 deal with different jobs.

PiperOrigin-RevId: 194027134
---
 .../contrib/all_reduce/python/all_reduce.py   | 28 +++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/tensorflow/contrib/all_reduce/python/all_reduce.py b/tensorflow/contrib/all_reduce/python/all_reduce.py
index 8add2aacff1..159d985db5c 100644
--- a/tensorflow/contrib/all_reduce/python/all_reduce.py
+++ b/tensorflow/contrib/all_reduce/python/all_reduce.py
@@ -18,10 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import math
-import re
 
 from tensorflow.contrib import nccl
+from tensorflow.python.framework import device as device_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -659,21 +660,20 @@ def _split_by_task(devices, values):
   num_devices = len(devices)
   if num_devices != len(values):
     raise ValueError("len(devices) must equal len(values)")
-  pattern = re.compile(r"/task:(\d+)/")
-  per_task_devices = []
-  per_task_values = []
+  per_task_devices = collections.OrderedDict()
+  per_task_values = collections.OrderedDict()
   for d in range(num_devices):
-    m = pattern.search(devices[d])
-    if m:
-      index = int(m.group(1))
-      while index >= len(per_task_devices):
-        per_task_devices.append([])
-        per_task_values.append([])
-      per_task_devices[index].append(devices[d])
-      per_task_values[index].append(values[d])
-    else:
+    d_spec = device_lib.DeviceSpec.from_string(devices[d])
+    if not hasattr(d_spec, "task") or d_spec.task is None:
       assert False, "failed to parse device %s" % devices[d]
-  return (per_task_devices, per_task_values)
+    index = (d_spec.job or "localhost", d_spec.replica or 0, d_spec.task)
+    if index not in per_task_devices:
+      per_task_devices[index] = []
+      per_task_values[index] = []
+    per_task_devices[index].append(devices[d])
+    per_task_values[index].append(values[d])
+
+  return (list(per_task_devices.values()), list(per_task_values.values()))
 
 
 def build_nccl_all_reduce(input_tensors, red_op, un_op=None):
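
The shape of the fix: instead of assuming tasks are densely numbered from zero
within a single job, devices are grouped under an explicit (job, replica,
task) key, so identically numbered tasks in different jobs no longer collide.
The same grouping idea in a C++ sketch (the device strings and pre-parsed
specs are illustrative; the Python code obtains them via
DeviceSpec.from_string):

    #include <iostream>
    #include <map>
    #include <string>
    #include <tuple>
    #include <vector>

    // (job, replica, task) key, mirroring the Python change above.
    using TaskKey = std::tuple<std::string, int, int>;

    int main() {
      // Device names paired with already-parsed (job, replica, task) specs.
      std::vector<std::pair<std::string, TaskKey>> devices;
      devices.emplace_back("/job:worker/replica:0/task:0/device:GPU:0",
                           TaskKey("worker", 0, 0));
      devices.emplace_back("/job:ps/replica:0/task:0/device:CPU:0",
                           TaskKey("ps", 0, 0));

      std::map<TaskKey, std::vector<std::string>> per_task;
      for (const auto& d : devices) per_task[d.second].push_back(d.first);

      // "worker" task 0 and "ps" task 0 land in different buckets, which is
      // exactly what indexing by the bare task number got wrong before.
      for (const auto& kv : per_task) {
        std::cout << std::get<0>(kv.first) << "/task:" << std::get<2>(kv.first)
                  << " -> " << kv.second.size() << " device(s)\n";
      }
      return 0;
    }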

From 22f3a97b8b089202f60bb0c7697feb0c8e0713cc Mon Sep 17 00:00:00 2001
From: Yifei Feng 
Date: Mon, 23 Apr 2018 21:19:14 -0700
Subject: [PATCH 0645/1734] Merge changes from github.

PiperOrigin-RevId: 194031845
---
 CODEOWNERS                                    |   2 +-
 README.md                                     |   2 +-
 RELEASE.md                                    |  58 +++
 WORKSPACE                                     |   8 +-
 tensorflow/c/c_api.h                          |   4 +-
 tensorflow/c/c_api_experimental.cc            |  12 +
 tensorflow/c/c_api_experimental.h             |   4 +-
 tensorflow/c/eager/c_api.h                    |   4 +-
 tensorflow/compiler/aot/runtime.cc            |   4 +-
 tensorflow/compiler/tests/binary_ops_test.py  |  12 +-
 .../compiler/xla/python/xla_client_test.py    |   1 -
 .../gpu/cudnn_convolution_algorithm_picker.cc |   4 +-
 .../compiler/xla/tests/dot_operation_test.cc  |   7 +
 .../autograph/converters/call_trees.py        |   2 +-
 .../autograph/converters/call_trees_test.py   |   2 +-
 .../autograph/converters/decorators_test.py   |   2 +-
 tensorflow/contrib/autograph/impl/api.py      |   4 +-
 .../contrib/autograph/impl/conversion.py      |   2 +-
 .../pyct/static_analysis/activity.py          |   6 +-
 .../pyct/static_analysis/activity_test.py     |   2 +-
 .../autograph/pyct/static_analysis/annos.py   |   8 +-
 .../contrib/autograph/utils/builtins.py       |   2 +-
 .../bayesflow/python/ops/monte_carlo_impl.py  |  26 +-
 .../training/functions/gbdt_batch_test.py     |   2 +-
 .../python/split_dependency_test.py           |   2 +-
 tensorflow/contrib/cmake/CMakeLists.txt       |  70 +++-
 tensorflow/contrib/cmake/README.md            |  28 ++
 .../contrib/cmake/external/gemmlowp.cmake     |   4 +-
 .../contrib/cmake/external/mkldnn.cmake       |  44 +++
 tensorflow/contrib/cmake/external/png.cmake   |  19 +-
 .../contrib/cmake/external/sqlite.cmake       |   4 +-
 .../contrib/cmake/tf_core_framework.cmake     |   8 +-
 tensorflow/contrib/cmake/tf_python.cmake      |   9 +-
 tensorflow/contrib/cmake/tf_shared_lib.cmake  |   3 +-
 .../contrib/cmake/tf_stream_executor.cmake    |   6 +
 .../contrib/cmake/tools/create_def_file.py    |   8 +-
 .../crf/python/kernel_tests/crf_test.py       |  15 +
 tensorflow/contrib/crf/python/ops/crf.py      |   8 +-
 .../cudnn_rnn/python/layers/cudnn_rnn.py      |   3 +-
 .../contrib/data/python/kernel_tests/BUILD    |   7 +-
 .../dataset_serialization_test_base.py        |   2 +-
 .../interleave_dataset_op_test.py             |  63 ++--
 .../kernel_tests/stats_dataset_ops_test.py    |  16 +
 .../contrib/data/python/ops/interleave_ops.py |  26 +-
 .../data/python/ops/prefetching_ops.py        |   6 +-
 .../contrib/data/python/ops/scan_ops.py       |   2 +-
 .../python/kernel_tests/shape_test.py         |   1 -
 tensorflow/contrib/eager/python/saver_test.py |   1 -
 .../estimator/python/estimator/head.py        |   2 +-
 .../python/estimator/replicate_model_fn.py    |   4 +-
 .../factorization/python/ops/gmm_ops.py       |  12 +-
 .../factorization/python/ops/kmeans.py        |   4 +-
 tensorflow/contrib/framework/__init__.py      |   3 +-
 .../python/framework/tensor_util_test.py      |   2 +-
 .../ops/fused_conv2d_bias_activation_op.py    |   2 +-
 .../fused_conv2d_bias_activation_op_test.py   |  10 +-
 .../eval/python/sliced_wasserstein_impl.py    |   2 +-
 .../features/python/virtual_batchnorm_impl.py |   6 +-
 tensorflow/contrib/hvx/README.md              |   3 +-
 .../kernels/adjust_hsv_in_yiq_op_gpu.cu.cc    |   2 +-
 .../contrib/image/ops/distort_image_ops.cc    |   4 +-
 tensorflow/contrib/image/ops/image_ops.cc     |   2 +-
 ...single_image_random_dot_stereograms_ops.cc |   4 +-
 .../contrib/image/python/ops/image_ops.py     |   2 +-
 .../single_image_random_dot_stereograms.py    |   2 +-
 .../contrib/kfac/python/ops/loss_functions.py |   6 +-
 .../kfac/python/ops/loss_functions_lib.py     |   1 -
 .../labeled_tensor/python/ops/ops_test.py     |   4 +-
 .../sparse_feature_cross_op_test.py           |   2 +-
 .../layers/python/layers/feature_column.py    |   2 +-
 .../python/layers/feature_column_ops.py       |   4 +-
 .../contrib/layers/python/layers/layers.py    | 142 ++++++-
 .../layers/python/layers/layers_test.py       |  15 +-
 .../python/layers/rev_block_lib_test.py       |   4 +-
 .../layers/python/layers/utils_test.py        |   1 -
 .../python/learn/estimators/kmeans_test.py    |   1 -
 .../python/learn/estimators/run_config.py     |   1 +
 tensorflow/contrib/lite/Makefile              |   3 +-
 .../contrib/lite/download_dependencies.sh     |   6 +-
 .../project.pbxproj                           |   8 -
 tensorflow/contrib/lite/g3doc/apis.md         |   2 +-
 .../Camera2BasicFragment.java                 |  23 ++
 .../tflitecamerademo/ImageClassifier.java     |  10 +
 .../res/layout/fragment_camera2_basic.xml     |  41 ++-
 .../demo/app/src/main/res/values/strings.xml  |   2 +
 .../java/org/tensorflow/lite/Interpreter.java |   7 +
 .../lite/NativeInterpreterWrapper.java        |   6 +
 .../native/nativeinterpreterwrapper_jni.cc    |  10 +
 .../native/nativeinterpreterwrapper_jni.h     |  12 +-
 tensorflow/contrib/lite/kernels/add.cc        |   2 +-
 tensorflow/contrib/lite/kernels/div.cc        |   5 +-
 .../internal/optimized/optimized_ops.h        |   2 +-
 .../internal/reference/reference_ops.h        |  39 +-
 tensorflow/contrib/lite/kernels/sub.cc        |   3 +-
 .../resolve_tensorflow_merge.cc               |   2 +-
 tensorflow/contrib/lite/toco/model.h          |   6 +-
 .../contrib/losses/python/losses/loss_ops.py  |   9 +-
 .../python/metric_learning/metric_loss_ops.py |   4 +-
 .../contrib/makefile/download_dependencies.sh |   4 +-
 .../meta_graph_transform.py                   |   2 +-
 .../contrib/metrics/python/ops/metric_ops.py  |  15 +-
 .../contrib/nn/python/ops/sampling_ops.py     |   2 +-
 tensorflow/contrib/opt/BUILD                  |  17 +
 tensorflow/contrib/opt/__init__.py            |   2 +
 .../contrib/opt/python/training/adamax.py     | 191 ++++++++++
 .../opt/python/training/adamax_test.py        | 348 ++++++++++++++++++
 .../training/moving_average_optimizer_test.py |   4 +-
 .../optimizer_v2/checkpointable_utils_test.py |   2 +-
 .../contrib/optimizer_v2/optimizer_v2.py      |   2 +-
 .../quantize/python/fold_batch_norms.py       |   2 +-
 .../kernel_tests/attention_wrapper_test.py    | 112 +++++-
 .../seq2seq/python/ops/attention_wrapper.py   |  38 +-
 .../python/kernel_tests/mel_ops_test.py       |  13 +
 .../contrib/signal/python/ops/mel_ops.py      |  16 +-
 tensorflow/contrib/slim/README.md             |   8 +-
 .../contrib/slim/python/slim/learning.py      |   5 +-
 .../slim/python/slim/nets/resnet_v1.py        |   2 +-
 .../slim/python/slim/nets/resnet_v2.py        |   2 +-
 .../tensor_forest/client/random_forest.py     |   2 +-
 .../core/ops/hard_routing_function_op.cc      |   2 +-
 .../stochastic_hard_routing_function_op.cc    |   2 +-
 .../stochastic_hard_routing_gradient_op.cc    |   2 +-
 .../tensor_forest/kernels/tree_utils.cc       |   4 +-
 .../tensor_forest/kernels/tree_utils.h        |   2 +-
 .../kernels/v4/decision-tree-resource.h       |   2 +-
 .../kernels/v4/decision_node_evaluator.h      |   2 +-
 .../contrib/tensor_forest/ops/model_ops.cc    |   2 +-
 .../contrib/tensor_forest/ops/stats_ops.cc    |   6 +-
 .../tensor_forest/python/tensor_forest.py     |   2 +-
 tensorflow/contrib/tensorrt/BUILD             |  21 +-
 tensorflow/contrib/tensorrt/README.md         |  60 +--
 .../resources/trt_resource_manager.cc         |   6 +
 .../tensorrt/resources/trt_resource_manager.h |   6 +-
 .../tensorrt/test/tf_trt_integration_test.py  | 156 ++++++++
 .../python/timeseries/math_utils.py           |   2 +-
 .../training/python/training/resample.py      |   2 +-
 .../training/python/training/sampling_ops.py  |   6 +-
 .../training/sequence_queueing_state_saver.py |   4 +-
 tensorflow/core/BUILD                         |  16 +
 .../base_api/api_def_ApplyAdaMax.pbtxt        |  78 ++++
 .../base_api/api_def_BroadcastTo.pbtxt        |  41 +++
 .../base_api/api_def_ImageSummary.pbtxt       |   2 +-
 .../api_def_ResourceApplyAdaMax.pbtxt         |  72 ++++
 .../base_api/api_def_StringStrip.pbtxt        |  16 +
 .../python_api/api_def_ApplyAdaMax.pbtxt      |   4 +
 .../python_api/api_def_BroadcastTo.pbtxt      |   4 +
 .../api_def_ResourceApplyAdaMax.pbtxt         |   4 +
 .../core/common_runtime/bfc_allocator.h       |   2 +-
 .../core/common_runtime/mkl_cpu_allocator.h   |   4 +
 tensorflow/core/framework/collective.h        |   2 +-
 tensorflow/core/framework/numeric_types.h     |   4 +-
 tensorflow/core/graph/mkl_tfconversion_pass.h |   4 +
 .../grappler/clusters/single_machine_test.cc  |   9 +
 tensorflow/core/grappler/optimizers/BUILD     |   1 +
 .../optimizers/custom_graph_optimizer.h       |   4 +-
 .../custom_graph_optimizer_registry_test.cc   |   5 +-
 .../optimizers/meta_optimizer_test.cc         |   5 +-
 tensorflow/core/kernels/BUILD                 |  50 +--
 .../batching_util/shared_batch_scheduler.h    |   6 +-
 tensorflow/core/kernels/broadcast_to_op.cc    |  91 +++++
 tensorflow/core/kernels/broadcast_to_op.h     | 220 +++++++++++
 .../core/kernels/broadcast_to_op_gpu.cu.cc    |  34 ++
 tensorflow/core/kernels/conv_ops_gpu.h        |   5 +-
 tensorflow/core/kernels/ctc_decoder_ops.cc    |  34 +-
 .../core/kernels/mkl_input_conversion_op.cc   |  35 +-
 tensorflow/core/kernels/mkl_relu_op.cc        |   8 +-
 tensorflow/core/kernels/roll_op.cc            |   7 +-
 .../core/kernels/segment_reduction_ops.h      |   8 +
 tensorflow/core/kernels/string_strip_op.cc    |  53 +++
 tensorflow/core/kernels/training_ops.cc       | 150 ++++++++
 tensorflow/core/kernels/training_ops.h        |  12 +
 .../core/kernels/training_ops_gpu.cu.cc       |  30 ++
 tensorflow/core/lib/bfloat16/bfloat16.h       |   4 +-
 tensorflow/core/lib/gtl/manual_constructor.h  |   2 +-
 tensorflow/core/lib/strings/stringprintf.cc   |  10 +-
 .../core/lib/strings/stringprintf_test.cc     |   4 +-
 tensorflow/core/ops/array_ops.cc              |  52 +++
 tensorflow/core/ops/dataset_ops.cc            | 140 ++++++-
 tensorflow/core/ops/manip_ops.cc              |  13 +-
 tensorflow/core/ops/nn_ops.cc                 |   6 +
 tensorflow/core/ops/random_ops.cc             |   7 +-
 tensorflow/core/ops/string_ops.cc             |   5 +
 tensorflow/core/ops/training_ops.cc           |  51 +++
 tensorflow/core/platform/default/logging.cc   |   1 +
 .../platform/hadoop/hadoop_file_system.cc     |   2 +
 .../core/protobuf/rewriter_config.proto       |  11 +
 tensorflow/core/public/version.h              |   4 +-
 tensorflow/core/util/memmapped_file_system.cc |   2 +-
 tensorflow/core/util/memmapped_file_system.h  |   4 +-
 tensorflow/core/util/mkl_util.h               |   4 +
 .../python/contrib.bayesflow.monte_carlo.md   |  28 +-
 .../docs_src/community/documentation.md       |  50 +--
 tensorflow/docs_src/deploy/s3.md              |  81 +++-
 .../docs_src/extend/language_bindings.md      |   9 +-
 tensorflow/docs_src/install/install_c.md      |   2 +-
 tensorflow/docs_src/install/install_go.md     |   2 +-
 tensorflow/docs_src/install/install_java.md   |  24 +-
 tensorflow/docs_src/install/install_linux.md  |  58 ++-
 tensorflow/docs_src/install/install_mac.md    |  10 +-
 .../docs_src/install/install_sources.md       |   9 +-
 tensorflow/docs_src/mobile/android_build.md   |   3 +-
 .../docs_src/performance/quantization.md      |   2 +-
 .../docs_src/programmers_guide/debugger.md    |   2 +-
 .../docs_src/programmers_guide/graphs.md      |   6 +-
 .../docs_src/programmers_guide/saved_model.md |  50 +--
 .../docs_src/programmers_guide/using_tpu.md   |   4 +-
 .../docs_src/tutorials/audio_recognition.md   |   2 +-
 tensorflow/docs_src/tutorials/layers.md       |  17 +-
 .../tutorials/word2vec/word2vec_basic.py      |   2 +-
 tensorflow/go/op/wrappers.go                  |   2 +-
 .../org/tensorflow/examples/LabelImage.java   |   2 +
 tensorflow/python/BUILD                       |  19 +-
 tensorflow/python/debug/cli/readline_ui.py    |   8 +-
 .../python/debug/wrappers/grpc_wrapper.py     |  11 +-
 tensorflow/python/debug/wrappers/hooks.py     |  17 +-
 tensorflow/python/estimator/canned/head.py    |   9 +-
 tensorflow/python/estimator/estimator.py      |   5 +-
 tensorflow/python/estimator/run_config.py     |  33 +-
 .../python/estimator/run_config_test.py       |  24 +-
 .../python/feature_column/feature_column.py   |   1 -
 tensorflow/python/framework/dtypes.py         |  14 +-
 .../python/framework/graph_util_impl.py       |   2 +-
 .../python/framework/graph_util_test.py       |   2 +-
 tensorflow/python/framework/load_library.py   |   2 +-
 tensorflow/python/framework/python_op_gen.i   |   8 +-
 tensorflow/python/framework/test_util.py      |   2 +
 .../python/grappler/layout_optimizer_test.py  |  10 +-
 .../python/keras/_impl/keras/backend.py       |   4 +-
 .../keras/_impl/keras/layers/normalization.py |   4 +-
 tensorflow/python/kernel_tests/BUILD          |  26 ++
 .../kernel_tests/broadcast_to_ops_test.py     |  85 +++++
 .../kernel_tests/confusion_matrix_test.py     |   7 +-
 .../python/kernel_tests/constant_op_test.py   |   5 +
 .../kernel_tests/conv3d_transpose_test.py     |  12 +
 .../python/kernel_tests/manip_ops_test.py     |  55 ++-
 .../python/kernel_tests/norm_op_test.py       |  16 +-
 .../python/kernel_tests/py_func_test.py       |  32 ++
 .../random/multinomial_op_test.py             |   2 +-
 .../kernel_tests/random/random_ops_test.py    |  11 +
 .../kernel_tests/string_strip_op_test.py      |  56 +++
 tensorflow/python/lib/core/py_func.cc         |   3 +
 tensorflow/python/ops/array_ops.py            |  15 +-
 .../python/ops/distributions/categorical.py   |   2 +-
 tensorflow/python/ops/embedding_ops.py        |  26 +-
 tensorflow/python/ops/histogram_ops.py        |   1 -
 tensorflow/python/ops/image_ops_impl.py       |  74 ++--
 tensorflow/python/ops/init_ops.py             |  18 +-
 tensorflow/python/ops/linalg_ops.py           |  77 ++--
 tensorflow/python/ops/linalg_ops_impl.py      |  73 ++++
 tensorflow/python/ops/losses/losses_impl.py   |  23 +-
 tensorflow/python/ops/math_ops.py             |  38 +-
 tensorflow/python/ops/nn.py                   |   1 +
 tensorflow/python/ops/nn_impl.py              |  11 +-
 tensorflow/python/ops/nn_ops.py               |   8 +-
 tensorflow/python/ops/rnn_cell_impl.py        |   4 +-
 .../python/profiler/tfprof_logger_test.py     |   2 +-
 tensorflow/python/tools/saved_model_cli.py    |   3 +-
 tensorflow/python/training/saver_test.py      |   2 +-
 tensorflow/python/util/compat.py              |   7 +-
 tensorflow/stream_executor/cuda/cuda_dnn.cc   |   7 +-
 tensorflow/stream_executor/cuda/cuda_dnn.h    |   2 +-
 .../stream_executor/cuda/cuda_driver.cc       |  14 +-
 .../stream_executor/cuda/cuda_gpu_executor.cc |   2 +-
 tensorflow/stream_executor/dnn.h              |  20 +-
 tensorflow/stream_executor/platform/port.h    |   6 -
 tensorflow/tensorflow.bzl                     |   3 +-
 .../tensorflow.estimator.-run-config.pbtxt    |   6 +-
 tensorflow/tools/api/golden/tensorflow.pbtxt  |   4 +
 tensorflow/tools/ci_build/builds/pip.sh       |   4 +
 .../tools/ci_build/builds/test_user_ops.sh    |  39 +-
 .../tools/ci_build/linux/cpu/run_mkl.sh       |   5 +-
 .../ci_build/windows/gpu/cmake/run_py.bat     |   6 +-
 tensorflow/tools/docker/Dockerfile.devel      |   2 +-
 .../tools/docker/Dockerfile.devel-cpu-mkl     |   2 +-
 tensorflow/tools/docker/Dockerfile.devel-gpu  |   2 +-
 tensorflow/tools/git/gen_git_source.py        |  56 ++-
 tensorflow/tools/git/gen_git_source.sh        |  10 +-
 .../tools/graph_transforms/transform_graph.cc |  70 +++-
 tensorflow/tools/pip_package/setup.py         |   2 +-
 tensorflow/workspace.bzl                      |   9 +-
 third_party/repo.bzl                          |   3 +-
 281 files changed, 4022 insertions(+), 893 deletions(-)
 create mode 100644 tensorflow/contrib/cmake/external/mkldnn.cmake
 create mode 100644 tensorflow/contrib/opt/python/training/adamax.py
 create mode 100644 tensorflow/contrib/opt/python/training/adamax_test.py
 create mode 100644 tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_StringStrip.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_ApplyAdaMax.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_BroadcastTo.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_ResourceApplyAdaMax.pbtxt
 create mode 100644 tensorflow/core/kernels/broadcast_to_op.cc
 create mode 100644 tensorflow/core/kernels/broadcast_to_op.h
 create mode 100644 tensorflow/core/kernels/broadcast_to_op_gpu.cu.cc
 create mode 100644 tensorflow/core/kernels/string_strip_op.cc
 create mode 100644 tensorflow/python/kernel_tests/broadcast_to_ops_test.py
 create mode 100644 tensorflow/python/kernel_tests/string_strip_op_test.py
 create mode 100644 tensorflow/python/ops/linalg_ops_impl.py

diff --git a/CODEOWNERS b/CODEOWNERS
index 007a304c3e7..b9f0313cc6d 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -45,7 +45,7 @@
 # /tensorflow/contrib/session_bundle/ @nfiedel @sukritiramesh
 # /tensorflow/contrib/slim/ @sguada @thenbasilmanran
 # /tensorflow/contrib/stateless/ @girving
-# /tensorflow/contrib/tensor_forest/ @gilberthendry @thomascolthurst
+# /tensorflow/contrib/tensor_forest/ @gilberthendry @thomascolthurst @yupbank
 # /tensorflow/contrib/testing/ @dandelionmane
 # /tensorflow/contrib/timeseries/ @allenlavoie
 # /tensorflow/contrib/tpu/ @frankchn @saeta @jhseu
diff --git a/README.md b/README.md
index 29418dc2e97..e1a50c87e26 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ data flow graphs.  The graph nodes represent mathematical operations, while
 the graph edges represent the multidimensional data arrays (tensors) that flow
 between them.  This flexible architecture enables you to deploy computation to one
 or more CPUs or GPUs in a desktop, server, or mobile device without rewriting
-code.  TensorFlow also includes TensorBoard, a data visualization toolkit.
+code.  TensorFlow also includes [TensorBoard](https://www.tensorflow.org/programmers_guide/summaries_and_tensorboard), a data visualization toolkit.
 
 TensorFlow was originally developed by researchers and engineers
 working on the Google Brain team within Google's Machine Intelligence Research
diff --git a/RELEASE.md b/RELEASE.md
index e8459531748..2717c75740a 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,61 @@
+# Release 1.8.0
+
+## Major Features And Improvements
+* Can now pass `tf.contrib.distribute.MirroredStrategy()` to `tf.estimator.RunConfig()` to run an Estimator model on multiple GPUs on one machine.
+* Add `tf.contrib.data.prefetch_to_device()`, which supports prefetching to GPU memory.
+* Added Gradient Boosted Trees as pre-made Estimators: BoostedTreesClassifier, BoostedTreesRegressor.
+* Add 3rd generation pipeline config for Cloud TPUs which improves performance and usability.
+* `tf.contrib.bayesflow` is moving out to its own repo.
+* Added `tf.contrib.{proto,rpc}` to allow generic proto parsing and RPC communication.
+
+## Bug Fixes and Other Changes
+* `tf.data`:
+  * Add `tf.contrib.data.prefetch_to_device`, which enables prefetching dataset elements to GPU memory.
+  * Add `tf.contrib.data.AUTOTUNE`, which allows the tf.data runtime to automatically tune the prefetch buffer sizes based on your system and environment.
+  * Add `tf.contrib.data.make_csv_dataset` for building datasets of CSV files.
+* Eager Execution:
+  * With eager execution, Datasets can now be used as standard Python iterators (`for batch in dataset:`; see the usage sketch after this diff). Both `Dataset.__iter__()` and `Dataset.make_one_shot_iterator()` can now be used to create iterators when eager execution is enabled.
+  * Automatic device placement has been enabled (i.e., a GPU is used automatically when available, without requiring an explicit `with tf.device("/gpu:0")`). (Fixes #14133)
+  * `tf.GradientTape` has moved out of contrib.
+* `tf.keras`:
+  * Added the Fashion-MNIST dataset.
+  * New data preprocessing functions: `image/random_brightness`, `sequence/TimeseriesGenerator`, and `text/hashing_trick`.
+* Accelerated Linear Algebra (XLA):
+  * Select and scatter in reference util and evaluator now use lexicographical order to break ties.
+* TensorFlow Debugger (tfdbg) CLI:
+  * During tensor-filter operations, allow exclusion of nodes by regular expressions.
+  * Fix spurious background colors in some text terminals.
+* `tf.contrib`:
+  * Add meta-distribution BatchReshape which reshapes batch dimensions.
+  * `tf.contrib.layers.recompute_grad` works for explicit gradient checkpointing on TPU.
+  * Add `tf.contrib.framework.argsort`.
+  * Allow `DNNBoostedTreeCombinedEstimator` to work with core versions of feature columns and losses.
+  * Add non-linear image warping ops: `tf.contrib.image.sparse_image_warp`, `tf.contrib.image.dense_image_warp`, and `tf.contrib.image.interpolate_spline`.
+  * Fix bug in `tf.contrib.opt.MultitaskOptimizerWrapper` where types of tensors were mismatched.
+* Other:
+  * Low-level graph construction now calls the TensorFlow C API. This change should be invisible to most users, but can be disabled by setting the environment variable `TF_C_API_GRAPH_CONSTRUCTION=0` in this release. Future releases will remove the ability to disable this change. Please [file a bug](https://github.com/tensorflow/tensorflow/issues/new) if you find yourself using this escape hatch.
+  * Add description of shapes and a pointer to tutorial notebook in `tf.distributions.Distribution`.
+  * Update scatter operations:
+    * Add `tf.scatter_min` and `tf.scatter_max`
+    * Extend scatter operations to work with a scalar update parameter.
+  * Move cuDNN RNN ops to core for use in TensorFlow codebase only.
+  * Add `float64` support for `Conv2d`, `Conv2dBackpropInput`, and `Conv2dBackpropFilter`.
+  * Add `float64` support for `AvgPool`/`AvgPoolGrad`.
+  * Make graph name scopes thread-local so that they work correctly in multi-threaded environments.
+  * Update nsync synchronization library to avoid slow primitives on Linux.
+  * Removed need to put nsync/public on C include path when building custom ops.
+  * Add `tf.image.psnr`, `tf.image.ssim`, `tf.image.ssim_multiscale`, `tf.image.image_gradients`, `tf.image.sobel_edges`.
+  * Add links to https://js.tensorflow.org.
+  * Fix non-uniformity of orthogonal matrices.
+  * Fix bug where multi-image Estimator eval summaries were not displayed correctly.
+
+## Thanks to our Contributors
+
+This release contains contributions from many people at Google, as well as:
+
+4d55397500, Aghasy, Alan Du, Alan Lee, Alan Yee, Alex Wiltschko, Animesh Karnewar, Ankit Gupta, Anton Matosov, Aris L, Ben Barsdell, Brent Yi, Brett Koonce, Carl Thomé, cbockman, Chikanaga Tomoyuki, Chris Tava, CéDric Deltheil, Dahan Gong, Dalmo Cirne, Daniel Erenrich, David Norman, DavidNorman, Edd Wilder-James, Fanjin Zeng, Felix Abecassis, fo40225, George Sterpu, Giovanni Terlingen, Gor Baghdasaryan, Guillaume Klein, Hanchen Li, Ilya Polenov, Jakub Kolodziejczyk, Jason Sadler, Jayaram Bobba, Jerry Liu, jinghuangintel, Jiongyan Zhang (张炯衍), Joel Shor, Jong Wook Kim, Julian Eisenschlos, Karl Lessard, Krish Ravindranath, Loo Rong Jie, Lukas Geiger, Luke Iwanski, Mahmoud Abuzaina, ManHyuk, Marvin Richter, Maximilian Mitchell, Mohammad Ashraf Bhuiyan, msofka, Mustafa Kasap, Nathan Burnham, Nathan Luehr, Naveen Marri, ngc92, nio1814, Oleg Zabluda, Ou Changkun, Panos Ipeirotis, Paul Van Eck, Peter Lee, Piotr Czapla, qjivy, Rholais Lii, Rodrigo Formigone, Russell Klopfer, ryantimjohn, Sang Han, SebastiáN RamíRez, shengfuintel, Siby Jose Plathottam, Silver Chan, Stanislaw Antol, Taehoon Lee, Tarang Chugh, Ted Chang, Thomas Bastiani, Xian Xu, Xiaoming (Jason) Cui, Yan Facai (颜发才), yaox12, Yashal Shakti Kanungo, Yong Tang, Yuan (Terry) Tang, Yuxin Wu, Ziyue(Louis) Lu
+
+
 # Release 1.7.0
 
 ## Major Features And Improvements
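The eager-execution bullet above (Datasets as standard Python iterators) is easiest to see in code. A minimal sketch against the 1.8 API described in these notes; the dataset values are made up for illustration:

```
import tensorflow as tf

tf.enable_eager_execution()

# With eager execution enabled, a Dataset is a plain Python iterable;
# no Iterator/get_next()/Session plumbing is needed.
dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4]).batch(2)
for batch in dataset:
  print(batch.numpy())  # [1 2], then [3 4]
```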
diff --git a/WORKSPACE b/WORKSPACE
index 11c5cdb2070..4ddfb9a3832 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -2,11 +2,11 @@ workspace(name = "org_tensorflow")
 
 http_archive(
     name = "io_bazel_rules_closure",
-    sha256 = "6691c58a2cd30a86776dd9bb34898b041e37136f2dc7e24cadaeaf599c95c657",
-    strip_prefix = "rules_closure-08039ba8ca59f64248bb3b6ae016460fe9c9914f",
+    sha256 = "a38539c5b5c358548e75b44141b4ab637bba7c4dc02b46b1f62a96d6433f56ae",
+    strip_prefix = "rules_closure-dbb96841cc0a5fb2664c37822803b06dab20c7d1",
     urls = [
-        "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz",
-        "https://github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz",  # 2018-01-16
+        "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz",
+        "https://github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz",  # 2018-04-13
     ],
 )
 
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index fe85f8ee0ed..c8594347451 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -72,7 +72,7 @@ limitations under the License.
 #ifdef SWIG
 #define TF_CAPI_EXPORT
 #else
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
 #ifdef TF_COMPILE_LIBRARY
 #define TF_CAPI_EXPORT __declspec(dllexport)
 #else
@@ -80,7 +80,7 @@ limitations under the License.
 #endif  // TF_COMPILE_LIBRARY
 #else
 #define TF_CAPI_EXPORT __attribute__((visibility("default")))
-#endif  // COMPILER_MSVC
+#endif  // _WIN32
 #endif  // SWIG
 
 #ifdef __cplusplus
diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index 9678ee926fc..d3916bc1677 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -184,6 +184,7 @@ library {
   return std::move(functions[0]);
 }
 
+#if not defined(PLATFORM_WINDOWS)
 //  On success, returns a set of TF_Function instances encoding a dataset
 //  node stack that reads a Imagenet TFRecordFile dataset from `file_path`, and
 //  sets `dataset_name` to the created dataset name. The returned functions must
@@ -7076,7 +7077,9 @@ library {
   return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status);
 #endif
 }
+#endif
 
+#if not defined(PLATFORM_WINDOWS)
 //  On success, returns a set of TF_Function instances encoding a dataset
 //  node stack that reads an MNIST file dataset from `file_path`, and
 //  sets `dataset_name` to the created dataset name. The returned functions must
@@ -8221,6 +8224,7 @@ library {
   return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status);
 #endif
 }
+#endif
 
 // Adds the input functions to `graph`.  On success, returns the created
 // IteratorGetNext node.
@@ -8314,6 +8318,13 @@ TF_Operation* TF_MakeFakeIteratorGetNextWithDatasets(TF_Graph* graph,
 TF_Operation* TF_MakeFileBasedIteratorGetNextWithDatasets(
     TF_Graph* graph, const char* file_path, int batch_size,
     unsigned char is_mnist, TF_Status* status) {
+#if defined(PLATFORM_WINDOWS)
+  // TODO(ashankar): get these functions working on Windows.
+  status->status = tensorflow::errors::Unimplemented(
+      "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
+      "is not implemented for Windows");
+  return nullptr;
+#else
   tensorflow::Status s;
 
   std::string dataset_name;
@@ -8355,4 +8366,5 @@ TF_Operation* TF_MakeFileBasedIteratorGetNextWithDatasets(
           << graph->graph.ToGraphDefDebug().DebugString();
 
   return getnext_node;
+#endif
 }
diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h
index 666342974ee..88cb173cd25 100644
--- a/tensorflow/c/c_api_experimental.h
+++ b/tensorflow/c/c_api_experimental.h
@@ -35,7 +35,7 @@ limitations under the License.
 #ifdef SWIG
 #define TF_CAPI_EXPORT
 #else
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
 #ifdef TF_COMPILE_LIBRARY
 #define TF_CAPI_EXPORT __declspec(dllexport)
 #else
@@ -43,7 +43,7 @@ limitations under the License.
 #endif  // TF_COMPILE_LIBRARY
 #else
 #define TF_CAPI_EXPORT __attribute__((visibility("default")))
-#endif  // COMPILER_MSVC
+#endif  // _WIN32
 #endif  // SWIG
 
 #ifdef __cplusplus
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 15ac0f376c1..ba77f3cd07f 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -30,7 +30,7 @@ limitations under the License.
 #ifdef SWIG
 #define TF_CAPI_EXPORT
 #else
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
 #ifdef TF_COMPILE_LIBRARY
 #define TF_CAPI_EXPORT __declspec(dllexport)
 #else
@@ -38,7 +38,7 @@ limitations under the License.
 #endif  // TF_COMPILE_LIBRARY
 #else
 #define TF_CAPI_EXPORT __attribute__((visibility("default")))
-#endif  // COMPILER_MSVC
+#endif  // _WIN32
 #endif  // SWIG
 
 #ifdef __cplusplus
diff --git a/tensorflow/compiler/aot/runtime.cc b/tensorflow/compiler/aot/runtime.cc
index 57727766661..5e74079fc15 100644
--- a/tensorflow/compiler/aot/runtime.cc
+++ b/tensorflow/compiler/aot/runtime.cc
@@ -31,7 +31,7 @@ namespace {
 inline void* aligned_malloc(size_t size, int minimum_alignment) {
 #if defined(__ANDROID__) || defined(OS_ANDROID) || defined(OS_CYGWIN)
   return memalign(minimum_alignment, size);
-#elif defined(COMPILER_MSVC)
+#elif defined(_WIN32)
   return _aligned_malloc(size, minimum_alignment);
 #else  // !__ANDROID__ && !OS_ANDROID && !OS_CYGWIN
   void* ptr = nullptr;
@@ -48,7 +48,7 @@ inline void* aligned_malloc(size_t size, int minimum_alignment) {
 }
 
 inline void aligned_free(void* aligned_memory) {
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
   _aligned_free(aligned_memory);
 #else
   free(aligned_memory);
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index d1d7379c0a3..1e4dd32916c 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -360,11 +360,13 @@ class BinaryOpsTest(XLATestCase):
           np.array([2, -1], dtype=dtype),
           expected=np.array([[[[3, 1], [5, 3]]]], dtype=dtype))
 
-    self._testBinary(
-        math_ops.add,
-        np.array([0xffffffff, 0xfffffffff, 1, 1], dtype=np.int64),
-        np.array([1, 1, 0xffffffff, 0xfffffffff], dtype=np.int64),
-        expected=np.array([1 << 32, 1 << 36, 1 << 32, 1 << 36], dtype=np.int64))
+    if np.int64 in self.numeric_types:
+      self._testBinary(
+          math_ops.add,
+          np.array([0xffffffff, 0xfffffffff, 1, 1], dtype=np.int64),
+          np.array([1, 1, 0xffffffff, 0xfffffffff], dtype=np.int64),
+          expected=np.array([1 << 32, 1 << 36, 1 << 32, 1 << 36],
+                            dtype=np.int64))
 
   def testComplexOps(self):
     for dtype in self.complex_types:
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index 6fe7b242e42..c073c02040e 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -1160,7 +1160,6 @@ class EmbeddedComputationsTest(LocalComputationTest):
       self._ExecuteAndCompareClose(
           c, expected=np.sum(input_array, axis=tuple(dims)))
 
-    _ReduceAndTest(0)
     _ReduceAndTest(0)
     _ReduceAndTest(0, 1)
     _ReduceAndTest(0, 2)
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
index 1790c50d4d6..c4c56c56928 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
@@ -97,9 +97,9 @@ bool ShouldIncludeWinogradNonfusedAlgo(const Shape& input_shape,
                                        const ConvolutionDimensionNumbers& dnums,
                                        se::StreamExecutor* stream_exec) {
   // Skip this check for cudnn7 and newer.
-  se::port::StatusOr<std::tuple<int, int, int>> version =
+  auto version =
       stream_exec->AsDnn()->GetVersion();
-  if (version.ok() && std::get<0>(version.ValueOrDie()) >= 7) {
+  if (version.ok() && version.ValueOrDie().major_version() >= 7) {
     return true;
   }
 
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index 7b994a4c172..c4031dfee59 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
 using TypesF16F32 = ::testing::Types<Eigen::half, float>;
 using TypesF16F32F64 = ::testing::Types<Eigen::half, float, double>;
 using TypesF16F32F64CF64 =
     ::testing::Types<Eigen::half, float, double, complex64>;
+#elif !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16) && \
+    defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64) && \
+    defined(XLA_BACKEND_DOES_NOT_SUPPORT_COMPLEX)
+using TypesF16F32 = ::testing::Types<Eigen::half, float>;
+using TypesF16F32F64 = ::testing::Types<Eigen::half, float>;
+using TypesF16F32F64CF64 =
+    ::testing::Types<Eigen::half, float>;
 #else
 #error "Situation not handled yet"
 #endif
diff --git a/tensorflow/contrib/autograph/converters/call_trees.py b/tensorflow/contrib/autograph/converters/call_trees.py
index 2e5590b46cd..554f0471d44 100644
--- a/tensorflow/contrib/autograph/converters/call_trees.py
+++ b/tensorflow/contrib/autograph/converters/call_trees.py
@@ -146,7 +146,7 @@ class CallTreeTransformer(transformer.Base):
       # Inspect the target function decorators. If any include a @convert
       # or @graph_ready annotation, then they must be called as they are.
       # TODO(mdan): This may be quite heavy.
-      # To parse and re-analize each function for every call site could be quite
+      # To parse and re-analyze each function for every call site could be quite
       # wasteful. Maybe we could cache the parsed AST?
       try:
         target_node, _ = parser.parse_entity(target_entity)
diff --git a/tensorflow/contrib/autograph/converters/call_trees_test.py b/tensorflow/contrib/autograph/converters/call_trees_test.py
index c666dcb73b2..303dd54a4ee 100644
--- a/tensorflow/contrib/autograph/converters/call_trees_test.py
+++ b/tensorflow/contrib/autograph/converters/call_trees_test.py
@@ -34,7 +34,7 @@ class CallTreesTest(converter_test_base.TestCase):
   def test_basic(self):
 
     def test_fn_1(_):
-      raise ValueError('This should not be called in the compiled verison.')
+      raise ValueError('This should not be called in the compiled version.')
 
     def renamed_test_fn_1(a):
       return a + 1
diff --git a/tensorflow/contrib/autograph/converters/decorators_test.py b/tensorflow/contrib/autograph/converters/decorators_test.py
index e67ab1cd6a1..9c01f689127 100644
--- a/tensorflow/contrib/autograph/converters/decorators_test.py
+++ b/tensorflow/contrib/autograph/converters/decorators_test.py
@@ -28,7 +28,7 @@ from tensorflow.python.platform import test
 
 # The Python parser only briefly captures decorators into the AST.
 # The interpreter desugars them on load, and the decorated function loses any
-# trace of the decorator (which is notmally what you would expect, since
+# trace of the decorator (which is normally what you would expect, since
 # they are meant to be transparent).
 # However, decorators are still visible when you analyze the function
 # from inside a decorator, before it was applied - as is the case
diff --git a/tensorflow/contrib/autograph/impl/api.py b/tensorflow/contrib/autograph/impl/api.py
index d874ef15c93..24f87b2c14d 100644
--- a/tensorflow/contrib/autograph/impl/api.py
+++ b/tensorflow/contrib/autograph/impl/api.py
@@ -49,7 +49,7 @@ def convert(recursive=False, verbose=False, arg_types=None):
   function is called. This means the parameter values are known at compilation.
 
   Args:
-    recursive: Whether to recusrively convert any functions that the decorator
+    recursive: Whether to recursively convert any functions that the decorator
         function may call.
     verbose: Whether to output the compiled code in the logs.
     arg_types: See to_graph.
@@ -215,7 +215,7 @@ def to_graph(e,
 
   Args:
     e: A Python entity.
-    recursive: Whether to recusrively convert any functions that the decorator
+    recursive: Whether to recursively convert any functions that the decorator
         function may call.
     verbose: Whether to output the compiled code in the logs.
     arg_values: A dict containing value hints for symbols like function
diff --git a/tensorflow/contrib/autograph/impl/conversion.py b/tensorflow/contrib/autograph/impl/conversion.py
index e7230a5f450..55a30dc1279 100644
--- a/tensorflow/contrib/autograph/impl/conversion.py
+++ b/tensorflow/contrib/autograph/impl/conversion.py
@@ -61,7 +61,7 @@ class ConversionMap(object):
   This object is mutable, and is updated as functions are converted.
 
   Attributes:
-    recursive: Whether to recusrively convert any functions that the decorator
+    recursive: Whether to recursively convert any functions that the decorator
         function may call.
     nocompile_decorators: tuple of decorator functions that toggle compilation
         off.
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/activity.py b/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
index b81f5c7f87e..2c14c2c8c23 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
@@ -162,11 +162,11 @@ class Scope(object):
       self.parent.mark_returned(name)
 
 
-class ActivityAnalizer(transformer.Base):
+class ActivityAnalyzer(transformer.Base):
   """Annotates nodes with local scope information. See Scope."""
 
   def __init__(self, context, parent_scope):
-    super(ActivityAnalizer, self).__init__(context)
+    super(ActivityAnalyzer, self).__init__(context)
     self.scope = Scope(parent_scope)
     self._in_return_statement = False
 
@@ -356,4 +356,4 @@ class ActivityAnalizer(transformer.Base):
 
 
 def resolve(node, context, parent_scope=None):
-  return ActivityAnalizer(context, parent_scope).visit(node)
+  return ActivityAnalyzer(context, parent_scope).visit(node)
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py b/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
index d1c4a94b14f..ef79a295bfa 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
@@ -108,7 +108,7 @@ class ScopeTest(test.TestCase):
     self.assertFalse(QN('a') in child.referenced)
 
 
-class ActivityAnalizerTest(test.TestCase):
+class ActivityAnalyzerTest(test.TestCase):
 
   def _parse_and_analyze(self, test_fn):
     node, source = parser.parse_entity(test_fn)
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/annos.py b/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
index d6d9f7e1a60..b929b35b792 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Annotations used by the static analizer."""
+"""Annotations used by the static analyzer."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -28,15 +28,15 @@ class NoValue(Enum):
 
 
 class NodeAnno(NoValue):
-  """Additionnal annotations used by the static analyzer.
+  """Additional annotations used by the static analyzer.
 
   These are in addition to the basic annotations declared in anno.py.
   """
 
   # Symbols
   # These flags are boolean.
-  IS_LOCAL = 'Symbol is local to the function scope being analized.'
-  IS_PARAM = 'Symbol is a parameter to the function being analized.'
+  IS_LOCAL = 'Symbol is local to the function scope being analyzed.'
+  IS_PARAM = 'Symbol is a parameter to the function being analyzed.'
   IS_MODIFIED_SINCE_ENTRY = (
       'Symbol has been explicitly replaced in the current function scope.')
 
diff --git a/tensorflow/contrib/autograph/utils/builtins.py b/tensorflow/contrib/autograph/utils/builtins.py
index dfc3c86a3de..211e8eaee90 100644
--- a/tensorflow/contrib/autograph/utils/builtins.py
+++ b/tensorflow/contrib/autograph/utils/builtins.py
@@ -77,7 +77,7 @@ def is_tf_print_compatible(value):
 
 
 def dynamic_print(*values):
-  """Implementartion of print using dynamic dispatch.
+  """Implementation of print using dynamic dispatch.
 
   The function attempts to use tf.Print if all the values are compatible.
   Otherwise, it will fall back to py_func.
diff --git a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
index d193a8459d0..032b859d469 100644
--- a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
@@ -44,15 +44,13 @@ def expectation_importance_sampler(f,
                                    n=None,
                                    seed=None,
                                    name='expectation_importance_sampler'):
-  r"""Monte Carlo estimate of `\\(E_p[f(Z)] = E_q[f(Z) p(Z) / q(Z)]\\)`.
+  r"""Monte Carlo estimate of \\(E_p[f(Z)] = E_q[f(Z) p(Z) / q(Z)]\\).
 
-  With `\\(p(z) := exp^{log_p(z)}\\)`, this `Op` returns
+  With \\(p(z) := exp^{log_p(z)}\\), this `Op` returns
 
-  ```
   \\(n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ],  z_i ~ q,\\)
   \\(\approx E_q[ f(Z) p(Z) / q(Z) ]\\)
   \\(=       E_p[f(Z)]\\)
-  ```
 
   This integral is done in log-space with max-subtraction to better handle the
   often extreme values that `f(z) p(z) / q(z)` can take on.
@@ -121,14 +119,12 @@ def expectation_importance_sampler_logspace(
     name='expectation_importance_sampler_logspace'):
   r"""Importance sampling with a positive function, in log-space.
 
-  With `\\(p(z) := exp^{log_p(z)}\\)`, and `\\(f(z) = exp{log_f(z)}\\)`,
+  With \\(p(z) := exp^{log_p(z)}\\), and \\(f(z) = exp{log_f(z)}\\),
   this `Op` returns
 
-  ```
   \\(Log[ n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ] ],  z_i ~ q,\\)
   \\(\approx Log[ E_q[ f(Z) p(Z) / q(Z) ] ]\\)
   \\(=       Log[E_p[f(Z)]]\\)
-  ```
 
   This integral is done in log-space with max-subtraction to better handle the
   often extreme values that `f(z) p(z) / q(z)` can take on.
@@ -196,13 +192,11 @@ def _logspace_mean(log_values):
 
 def expectation(f, samples, log_prob=None, use_reparametrization=True,
                 axis=0, keep_dims=False, name=None):
-  """Computes the Monte-Carlo approximation of `\\(E_p[f(X)]\\)`.
+  """Computes the Monte-Carlo approximation of \\(E_p[f(X)]\\).
 
   This function computes the Monte-Carlo approximation of an expectation, i.e.,
 
-  ```none
   \\(E_p[f(X)] \approx= m^{-1} sum_i^m f(x_j),  x_j\  ~iid\ p(X)\\)
-  ```
 
   where:
 
@@ -216,8 +210,8 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
   parameterless distribution (e.g.,
   `Normal(Y; m, s) <=> Y = sX + m, X ~ Normal(0,1)`), we can swap gradient and
   expectation, i.e.,
-  `grad[ Avg{ \\(s_i : i=1...n\\) } ] = Avg{ grad[\\(s_i\\)] : i=1...n }` where
-  `S_n = Avg{\\(s_i\\)}` and `\\(s_i = f(x_i), x_i ~ p\\)`.
+  grad[ Avg{ \\(s_i : i=1...n\\) } ] = Avg{ grad[\\(s_i\\)] : i=1...n } where
+  S_n = Avg{\\(s_i\\)} and \\(s_i = f(x_i), x_i ~ p\\).
 
   However, if p is not reparameterized, TensorFlow's gradient will be incorrect
   since the chain-rule stops at samples of non-reparameterized distributions.
@@ -296,7 +290,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
   Args:
     f: Python callable which can return `f(samples)`.
     samples: `Tensor` of samples used to form the Monte-Carlo approximation of
-      `\\(E_p[f(X)]\\)`.  A batch of samples should be indexed by `axis`
+      \\(E_p[f(X)]\\).  A batch of samples should be indexed by `axis`
       dimensions.
     log_prob: Python callable which can return `log_prob(samples)`. Must
       correspond to the natural-logarithm of the pdf/pmf of each sample. Only
@@ -317,7 +311,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
 
   Returns:
     approx_expectation: `Tensor` corresponding to the Monte-Carlo approximation
-      of `\\(E_p[f(X)]\\)`.
+      of \\(E_p[f(X)]\\).
 
   Raises:
     ValueError: if `f` is not a Python `callable`.
@@ -329,7 +323,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
     if not callable(f):
       raise ValueError('`f` must be a callable function.')
     if use_reparametrization:
-      return math_ops.reduce_mean(f(samples), axis=axis, keep_dims=keep_dims)
+      return math_ops.reduce_mean(f(samples), axis=axis, keepdims=keep_dims)
     else:
       if not callable(log_prob):
         raise ValueError('`log_prob` must be a callable function.')
@@ -349,7 +343,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
       # "Is there a floating point value of x, for which x-x == 0 is false?"
       # http://stackoverflow.com/q/2686644
       fx += stop(fx) * (logpx - stop(logpx))  # Add zeros_like(logpx).
-      return math_ops.reduce_mean(fx, axis=axis, keep_dims=keep_dims)
+      return math_ops.reduce_mean(fx, axis=axis, keepdims=keep_dims)
 
 
 def _sample_mean(values):
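The docstrings above define the importance-sampling estimate \\(E_p[f(Z)] = E_q[f(Z) p(Z) / q(Z)] \approx n^{-1} \sum_i f(z_i) p(z_i) / q(z_i)\\) with \\(z_i \sim q\\). A plain NumPy/SciPy sketch of that estimator, not the contrib implementation (which additionally works in log-space with max-subtraction for numerical stability):

```
import numpy as np
from scipy import stats  # assumption: SciPy is available for the densities

rng = np.random.RandomState(0)

# Estimate E_p[f(Z)] for p = N(0, 1) and f(z) = z**2 (true value: 1.0),
# drawing samples from a wider proposal q = N(0, 2).
p = stats.norm(0.0, 1.0)
q = stats.norm(0.0, 2.0)
z = q.rvs(size=100000, random_state=rng)

# n^{-1} sum_i f(z_i) p(z_i) / q(z_i),  z_i ~ q
estimate = np.mean(z**2 * p.pdf(z) / q.pdf(z))
print(estimate)  # close to 1.0
```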
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
index 17dcb49f476..f9c22283b7f 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
@@ -45,7 +45,7 @@ from tensorflow.python.platform import googletest
 def _squared_loss(label, unused_weights, predictions):
   """Unweighted loss implementation."""
   loss = math_ops.reduce_sum(
-      math_ops.square(predictions - label), 1, keep_dims=True)
+      math_ops.square(predictions - label), 1, keepdims=True)
   return loss
 
 
diff --git a/tensorflow/contrib/checkpoint/python/split_dependency_test.py b/tensorflow/contrib/checkpoint/python/split_dependency_test.py
index cb964c80e94..f1d9d19b047 100644
--- a/tensorflow/contrib/checkpoint/python/split_dependency_test.py
+++ b/tensorflow/contrib/checkpoint/python/split_dependency_test.py
@@ -73,7 +73,7 @@ class OnlyOneDep(checkpointable.Checkpointable):
 
 class SplitTests(test.TestCase):
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testSaveRestoreSplitDep(self):
     save_checkpoint = checkpointable_utils.Checkpoint(
         dep=SaveTensorSlicesAsDeps())
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index bdf3e986351..5f38a8e5c75 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -31,10 +31,14 @@ option(tensorflow_BUILD_PYTHON_TESTS "Build python unit tests " OFF)
 option(tensorflow_BUILD_MORE_PYTHON_TESTS "Build more python unit tests for contrib packages" OFF)
 option(tensorflow_BUILD_SHARED_LIB "Build TensorFlow as a shared library" OFF)
 option(tensorflow_OPTIMIZE_FOR_NATIVE_ARCH "Enable compiler optimizations for the native processor architecture (if available)" ON)
-option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions")
 option(tensorflow_ENABLE_SNAPPY_SUPPORT "Enable SNAPPY compression support" ON)
 option(tensorflow_DISABLE_EIGEN_FORCEINLINE "Disable forceinline, to speed up build on windows." OFF)
 
+# SIMD, MKL and MKLDNN options
+option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions" OFF)
+option(tensorflow_ENABLE_MKL_SUPPORT "Enable Intel MKL support" OFF)
+option(tensorflow_ENABLE_MKLDNN_SUPPORT "Enable Intel MKLDNN support, requires MKL enabled" OFF)
+
 # GPU, CUDA and cuDNN options
 option(tensorflow_ENABLE_GPU "Enable GPU support" OFF)
 set(tensorflow_CUDA_VERSION "9.0" CACHE STRING "CUDA version to build against")
@@ -124,8 +128,16 @@ endif()
 
 add_definitions(-DEIGEN_AVOID_STL_ARRAY)
 if(WIN32)
-  add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC)
-  add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64 -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS)
+  if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+      # 64 bits
+      add_definitions(-DWIN64)
+  elseif(CMAKE_SIZEOF_VOID_P EQUAL 4)
+      # 32 bits
+      # temporary fix for #18241
+      add_definitions(-DEIGEN_DEFAULT_DENSE_INDEX_TYPE=std::int64_t)
+  endif()
+  add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11)
+  add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS)
   add_definitions(-DTENSORFLOW_USE_EIGEN_THREADPOOL -DEIGEN_HAS_C99_MATH)
   add_definitions(-DTF_COMPILE_LIBRARY)
   add_definitions(/bigobj /nologo /EHsc /GF /MP /Gm-)
@@ -162,12 +174,21 @@ endif()
 
 # MSVC SIMD instructions
 if (tensorflow_WIN_CPU_SIMD_OPTIONS)
+  include(CheckCXXCompilerFlag)
+  if (tensorflow_ENABLE_MKL_SUPPORT)
+    add_definitions(-DINTEL_MKL -DEIGEN_USE_VML)
+    if (NOT tensorflow_ENABLE_MKLDNN_SUPPORT)
+      add_definitions(-DINTEL_MKL_ML)
+    endif()
+  endif()
+  CHECK_CXX_COMPILER_FLAG("-fopenmp" COMPILER_OPT_OPENMP_SUPPORT)
+  if (COMPILER_OPT_OPENMP_SUPPORT)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+  endif()
   if (WIN32)
-    CHECK_CXX_COMPILER_FLAG("${tensorflow_WIN_CPU_SIMD_OPTIONS}" COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED)
+    CHECK_CXX_COMPILER_FLAG(${tensorflow_WIN_CPU_SIMD_OPTIONS} COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED)
     if(COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED)
       set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${tensorflow_WIN_CPU_SIMD_OPTIONS}")
-    else()
-      message(FATAL_ERROR "${tensorflow_WIN_CPU_SIMD_OPTIONS} not supported")
     endif()
   endif()
 endif()
@@ -302,6 +323,43 @@ if(HAIKU)
   list(APPEND tensorflow_EXTERNAL_LIBRARIES network)
 endif()
 
+if (tensorflow_ENABLE_MKL_SUPPORT)
+  if (WIN32)
+    find_path(MKL_HOME_PLATFORM mkl
+      PATHS ${MKL_HOME} ${MKL_HOME}/../ ${MKL_HOME}/../../
+      PATH_SUFFIXES windows)
+    set(MKL_INCLUDE_DIRS ${MKL_HOME_PLATFORM}/mkl/include)
+    set(MKL_LINK_DIRS
+      ${MKL_HOME_PLATFORM}/mkl/lib/intel64
+      ${MKL_HOME_PLATFORM}/tbb/lib/intel64/vc_mt
+      ${MKL_HOME_PLATFORM}/compiler/lib/intel64
+      ${MKL_HOME_PLATFORM}/mkl/tools/builder/lib)
+    set(MKL_REDIST_DLL_DIRS
+      ${MKL_HOME_PLATFORM}/redist/intel64/mkl
+      ${MKL_HOME_PLATFORM}/redist/intel64/tbb/vc_mt
+      ${MKL_HOME_PLATFORM}/redist/intel64/compiler)
+    list(APPEND tensorflow_EXTERNAL_LIBRARIES
+      mkl_intel_lp64_dll mkl_sequential_dll mkl_core_dll mkl_rt mkl_cdll_intel64)
+  endif()
+  if (UNIX)
+    # FIXME: complete the path on Linux
+    find_path(MKL_HOME_PLATFORM mkl
+      HINTS ${MKL_HOME} ${MKL_HOME}/../ ${MKL_HOME}/../../
+      PATH_SUFFIXES linux)
+    set(MKL_INCLUDE_DIRS ${MKL_HOME_PLATFORM}/mkl/include)
+    set(MKL_LINK_DIRS) # incomplete
+    set(MKL_REDIST_SO_DIRS) # incomplete
+  endif()
+  include_directories(${MKL_INCLUDE_DIRS})
+  link_directories(${MKL_LINK_DIRS})
+  if (tensorflow_ENABLE_MKLDNN_SUPPORT)
+    include(mkldnn)
+    list(APPEND tensorflow_EXTERNAL_LIBRARIES ${mkldnn_STATIC_LIBRARIES})
+    list(APPEND tensorflow_EXTERNAL_DEPENDENCIES mkldnn)
+    include_directories(${mkldnn_INCLUDE_DIRS})
+  endif()
+endif (tensorflow_ENABLE_MKL_SUPPORT)
+
 if (tensorflow_ENABLE_GPU)
   if (NOT WIN32)
     # Default install paths for cuda libraries in Linux
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index fe83bb32046..0b79f718d48 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -128,6 +128,18 @@ Step-by-step Windows build
      D:\local\cuda\bin
      ```
 
+   * When building with MKL support, after installing [MKL](https://software.intel.com/en-us/mkl) from Intel, append its bin directories to your PATH environment variable.
+
+     If TensorFlow fails to find the MKL DLLs during initialization, check your PATH environment variable.
+     It should contain the directories of the MKL DLLs. For example:
+
+     ```
+     D:\Tools\IntelSWTools\compilers_and_libraries\windows\redist\intel64\mkl
+     D:\Tools\IntelSWTools\compilers_and_libraries\windows\redist\intel64\compiler
+     D:\Tools\IntelSWTools\compilers_and_libraries\windows\redist\intel64\tbb\vc_mt
+     ```
+
+
    * We assume that `cmake` and `git` are installed and in your `%PATH%`. If
      for example `cmake` is not in your path and it is installed in
      `C:\Program Files (x86)\CMake\bin\cmake.exe`, you can add this directory
@@ -166,7 +178,15 @@ Step-by-step Windows build
    More? -Dtensorflow_ENABLE_GPU=ON ^
    More? -DCUDNN_HOME="D:\...\cudnn"
    ```
+   To build with MKL support, add "^" at the end of the last line above, followed by:
+
+   ```
+   More? -Dtensorflow_ENABLE_MKL_SUPPORT=ON ^
+   More? -DMKL_HOME="D:\...\compilers_and_libraries"
+   ```
+
   To enable SIMD instructions with MSVC, such as AVX and SSE, define it as follows:
+
    ```
    More? -Dtensorflow_WIN_CPU_SIMD_OPTIONS=/arch:AVX
    ```
@@ -226,6 +246,7 @@ Step-by-step Windows build
      ```
      ctest -C RelWithDebInfo
      ```
+
    * `-Dtensorflow_BUILD_MORE_PYTHON_TESTS=(ON|OFF)`. Defaults to `OFF`. This enables python tests on
     several major packages. This option is only valid if this and tensorflow_BUILD_PYTHON_TESTS are both set to `ON`.
      After building the python wheel, you need to install the new wheel before running the tests.
@@ -234,6 +255,12 @@ Step-by-step Windows build
      ctest -C RelWithDebInfo
      ```
 
+   * `-Dtensorflow_ENABLE_MKL_SUPPORT=(ON|OFF)`. Defaults to `OFF`. Include MKL support. If MKL is enabled you need to install the [Intel Math Kernel Library](https://software.intel.com/en-us/mkl).
+     CMake will expect the location of MKL in `-DMKL_HOME=path_you_install_mkl`.
+
+   * `-Dtensorflow_ENABLE_MKLDNN_SUPPORT=(ON|OFF)`. Defaults to `OFF`. Include MKL DNN support. MKL DNN is [Intel(R) Math Kernel Library for Deep Neural Networks (Intel(R) MKL-DNN)](https://github.com/intel/mkl-dnn). You have to add `-Dtensorflow_ENABLE_MKL_SUPPORT=ON` before including MKL DNN support.
+
+
 4. Invoke MSBuild to build TensorFlow.
 
    To build the C++ example program, which will be created as a `.exe`
@@ -251,6 +278,7 @@ Step-by-step Windows build
    D:\...\build> MSBuild /p:Configuration=Release tf_python_build_pip_package.vcxproj
    ```
 
+
 Linux Continuous Integration build
 ==================================
 
diff --git a/tensorflow/contrib/cmake/external/gemmlowp.cmake b/tensorflow/contrib/cmake/external/gemmlowp.cmake
index a235442dc5c..cdaa6b73b93 100644
--- a/tensorflow/contrib/cmake/external/gemmlowp.cmake
+++ b/tensorflow/contrib/cmake/external/gemmlowp.cmake
@@ -14,8 +14,8 @@
 # ==============================================================================
 include (ExternalProject)
 
-set(gemmlowp_URL https://github.com/google/gemmlowp/archive/6a2a90822e8546fc2bfa7044de0faf1c1cb4862f.zip)
-set(gemmlowp_HASH SHA256=3447948d219f3270383766bbe08942888c0eb4e0ca6663c0e0548502ec5bb77d)
+set(gemmlowp_URL https://github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip)
+set(gemmlowp_HASH SHA256=b87faa7294dfcc5d678f22a59d2c01ca94ea1e2a3b488c38a95a67889ed0a658)
 set(gemmlowp_BUILD ${CMAKE_CURRENT_BINARY_DIR}/gemmlowp/src/gemmlowp)
 set(gemmlowp_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/gemmlowp/src/gemmlowp)
 
diff --git a/tensorflow/contrib/cmake/external/mkldnn.cmake b/tensorflow/contrib/cmake/external/mkldnn.cmake
new file mode 100644
index 00000000000..a639fdee367
--- /dev/null
+++ b/tensorflow/contrib/cmake/external/mkldnn.cmake
@@ -0,0 +1,44 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+include (ExternalProject)
+
+set(mkldnn_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/include)
+set(mkldnn_URL https://github.com/01org/mkl-dnn.git)
+set(mkldnn_BUILD ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src)
+set(mkldnn_TAG 3063b2e4c943983f6bf5f2fb9a490d4a998cd291)
+
+if(WIN32)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    set(mkldnn_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/Release/mkldnn.lib)
+  else()
+    set(mkldnn_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/mkldnn.lib)
+  endif()
+else()
+    set(mkldnn_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/mkldnn/src/mkldnn/src/libmkldnn.a)
+endif()
+
+ExternalProject_Add(mkldnn
+    PREFIX mkldnn
+    GIT_REPOSITORY ${mkldnn_URL}
+    GIT_TAG ${mkldnn_TAG}
+    DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+    BUILD_IN_SOURCE 1
+    BUILD_BYPRODUCTS ${mkldnn_STATIC_LIBRARIES}
+    INSTALL_COMMAND ""
+    CMAKE_CACHE_ARGS
+        -DCMAKE_BUILD_TYPE:STRING=Release
+        -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+        -DMKLINC:STRING=${MKL_INCLUDE_DIRS}
+)
diff --git a/tensorflow/contrib/cmake/external/png.cmake b/tensorflow/contrib/cmake/external/png.cmake
index 6cd66a65990..ad2af01bc00 100644
--- a/tensorflow/contrib/cmake/external/png.cmake
+++ b/tensorflow/contrib/cmake/external/png.cmake
@@ -15,32 +15,33 @@
 include (ExternalProject)
 
 set(png_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/png_archive)
-set(png_URL https://storage.googleapis.com/libpng-public-archive/libpng-1.2.53.tar.gz)
-set(png_HASH SHA256=e05c9056d7f323088fd7824d8c6acc03a4a758c4b4916715924edc5dd3223a72)
+set(png_URL https://mirror.bazel.build/github.com/glennrp/libpng/archive/v1.6.34.tar.gz)
+set(png_HASH SHA256=e45ce5f68b1d80e2cb9a2b601605b374bdf51e1798ef1c2c2bd62131dfcf9eef)
 set(png_BUILD ${CMAKE_BINARY_DIR}/png/src/png)
 set(png_INSTALL ${CMAKE_BINARY_DIR}/png/install)
 
 if(WIN32)
   if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
     set(png_STATIC_LIBRARIES 
-      debug ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_staticd.lib
-      optimized ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_static.lib)
+      debug ${CMAKE_BINARY_DIR}/png/install/lib/libpng16_staticd.lib
+      optimized ${CMAKE_BINARY_DIR}/png/install/lib/libpng16_static.lib)
   else()
     if(CMAKE_BUILD_TYPE EQUAL Debug)
       set(png_STATIC_LIBRARIES 
-        ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_staticd.lib)
+        ${CMAKE_BINARY_DIR}/png/install/lib/libpng16_staticd.lib)
     else()
       set(png_STATIC_LIBRARIES 
-        ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_static.lib)
+        ${CMAKE_BINARY_DIR}/png/install/lib/libpng16_static.lib)
     endif()
   endif()
 else()
-  set(png_STATIC_LIBRARIES ${CMAKE_BINARY_DIR}/png/install/lib/libpng12.a)
+  set(png_STATIC_LIBRARIES ${CMAKE_BINARY_DIR}/png/install/lib/libpng16.a)
 endif()
 
 set(png_HEADERS
-    "${png_INSTALL}/include/libpng12/png.h"
-    "${png_INSTALL}/include/libpng12/pngconf.h"
+    "${png_INSTALL}/include/libpng16/png.h"
+    "${png_INSTALL}/include/libpng16/pngconf.h"
+    "${png_INSTALL}/include/libpng16/pnglibconf.h"
 )
 
 ExternalProject_Add(png
diff --git a/tensorflow/contrib/cmake/external/sqlite.cmake b/tensorflow/contrib/cmake/external/sqlite.cmake
index 57c4ae76517..7f835d2d519 100644
--- a/tensorflow/contrib/cmake/external/sqlite.cmake
+++ b/tensorflow/contrib/cmake/external/sqlite.cmake
@@ -15,8 +15,8 @@
 include (ExternalProject)
 
 set(sqlite_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/sqlite)
-set(sqlite_URL https://mirror.bazel.build/www.sqlite.org/2017/sqlite-amalgamation-3200000.zip)
-set(sqlite_HASH SHA256=208780b3616f9de0aeb50822b7a8f5482f6515193859e91ed61637be6ad74fd4)
+set(sqlite_URL https://mirror.bazel.build/www.sqlite.org/2018/sqlite-amalgamation-3230100.zip)
+set(sqlite_HASH SHA256=4239a1f69e5721d07d9a374eb84d594225229e54be4ee628da2995f4315d8dfc)
 set(sqlite_BUILD ${CMAKE_CURRENT_BINARY_DIR}/sqlite/src/sqlite)
 set(sqlite_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/sqlite/install)
 
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index a1c320347fe..b47c32f1c48 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -276,7 +276,7 @@ add_custom_command(OUTPUT __force_rebuild COMMAND ${CMAKE_COMMAND} -E echo)
 add_custom_command(OUTPUT
     ${VERSION_INFO_CC}
     COMMAND ${PYTHON_EXECUTABLE} ${tensorflow_source_dir}/tensorflow/tools/git/gen_git_source.py
-    --raw_generate ${VERSION_INFO_CC}
+    ARGS --raw_generate ${VERSION_INFO_CC} --source_dir ${tensorflow_source_dir} --git_tag_override=${GIT_TAG_OVERRIDE}
     DEPENDS __force_rebuild)
 set(tf_version_srcs ${tensorflow_source_dir}/tensorflow/core/util/version_info.cc)
 
@@ -341,9 +341,3 @@ add_dependencies(tf_core_framework
     tf_core_lib
     proto_text
 )
-
-if(WIN32)
-  # Cmake > 3.6 will quote this as -D"__VERSION__=\"MSVC\"" which nvcc fails on.
-  # Instead of defining this global, limit it to tf_core_framework where its used.
-  target_compile_definitions(tf_core_framework PRIVATE __VERSION__="MSVC")
-endif()
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index f6aaf41f735..c4bdb69d828 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -554,12 +554,13 @@ if(WIN32)
         set(pywrap_tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow.def")
     endif()
     set_source_files_properties(${pywrap_tensorflow_deffile} PROPERTIES GENERATED TRUE)
-
+    math(EXPR tensorflow_target_bitness "${CMAKE_SIZEOF_VOID_P}*8")
     add_custom_command(TARGET pywrap_tensorflow_internal_static POST_BUILD
         COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/tools/create_def_file.py
             --input "${pywrap_tensorflow_internal_static_dependencies}"
             --output "${pywrap_tensorflow_deffile}"
             --target _pywrap_tensorflow_internal.pyd
+            --bitness "${tensorflow_target_bitness}"
         BYPRODUCTS ${pywrap_tensorflow_deffile} # Required for Ninja
     )
 endif(WIN32)
@@ -589,6 +590,12 @@ add_library(pywrap_tensorflow_internal SHARED
     ${pywrap_tensorflow_deffile}
 )
 
+# There is a bug in GCC 5 resulting in undefined reference to a __cpu_model function when
+# linking to the tensorflow library. Adding the following libraries fixes it.
+if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
+    target_link_libraries(pywrap_tensorflow_internal PRIVATE gcc_s gcc)
+endif()
+
 if(WIN32)
     add_dependencies(pywrap_tensorflow_internal pywrap_tensorflow_internal_static)
 endif(WIN32)
diff --git a/tensorflow/contrib/cmake/tf_shared_lib.cmake b/tensorflow/contrib/cmake/tf_shared_lib.cmake
index 9738bbeb9ae..38f40452b53 100644
--- a/tensorflow/contrib/cmake/tf_shared_lib.cmake
+++ b/tensorflow/contrib/cmake/tf_shared_lib.cmake
@@ -52,12 +52,13 @@ if(WIN32)
     set(tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/tensorflow.def")
   endif()
   set_source_files_properties(${tensorflow_deffile} PROPERTIES GENERATED TRUE)
-
+  math(EXPR tensorflow_target_bitness "${CMAKE_SIZEOF_VOID_P}*8")
   add_custom_command(TARGET tensorflow_static POST_BUILD
       COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/tools/create_def_file.py
           --input "${tensorflow_static_dependencies}"
           --output "${tensorflow_deffile}"
           --target tensorflow.dll
+          --bitness "${tensorflow_target_bitness}"
   )
 endif(WIN32)
 
diff --git a/tensorflow/contrib/cmake/tf_stream_executor.cmake b/tensorflow/contrib/cmake/tf_stream_executor.cmake
index 91ca33f4c4d..af48ef1fd40 100644
--- a/tensorflow/contrib/cmake/tf_stream_executor.cmake
+++ b/tensorflow/contrib/cmake/tf_stream_executor.cmake
@@ -65,6 +65,12 @@ if (tensorflow_ENABLE_GPU)
     file(GLOB tf_stream_executor_gpu_srcs
         "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*.cc"
     )
+    if (NOT tensorflow_BUILD_CC_TESTS)
+        file(GLOB tf_stream_executor_gpu_tests
+            "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*_test.cc"
+        )
+        list(REMOVE_ITEM tf_stream_executor_gpu_srcs ${tf_stream_executor_gpu_tests})
+    endif()
     list(APPEND tf_stream_executor_srcs ${tf_stream_executor_gpu_srcs})
 endif()
 
diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py
index 53c2285699a..cffe069aa35 100644
--- a/tensorflow/contrib/cmake/tools/create_def_file.py
+++ b/tensorflow/contrib/cmake/tools/create_def_file.py
@@ -63,7 +63,7 @@ INCLUDE_RE = re.compile(r"^(TF_\w*)$|"
                         r"^(TFE_\w*)$|"
                         r"tensorflow::|"
                         r"functor::|"
-                        r"nsync_|"
+                        r"\?nsync_|"
                         r"perftools::gputools")
 
 # We want to identify data members explicitly in the DEF file, so that no one
@@ -87,6 +87,7 @@ def get_args():
                       required=True)
   parser.add_argument("--output", help="output deffile", required=True)
   parser.add_argument("--target", help="name of the target", required=True)
+  parser.add_argument("--bitness", help="build target bitness", required=True)
   args = parser.parse_args()
   return args
 
@@ -125,7 +126,10 @@ def main():
     # Header for the def file.
     def_fp.write("LIBRARY " + args.target + "\n")
     def_fp.write("EXPORTS\n")
-    def_fp.write("\t ??1OpDef@tensorflow@@UEAA@XZ\n")
+    if args.bitness == "64":
+      def_fp.write("\t??1OpDef@tensorflow@@UEAA@XZ\n")
+    else:
+      def_fp.write("\t??1OpDef@tensorflow@@UAE@XZ\n")
 
     # Each symbol returned by undname matches the same position in candidates.
     # We compare on undname but use the decorated name from candidates.
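For context on the `--bitness` flag added above: MSVC mangles the virtual `OpDef` destructor differently for x64 and x86 targets, so the hard-coded export must match the build's bitness. The build passes `CMAKE_SIZEOF_VOID_P * 8` in; a hypothetical Python-side equivalent of that computation, for illustration only:

```
import struct

def target_bitness():
  # Pointer size of the running interpreter: 8 bytes -> "64", 4 -> "32".
  # Hypothetical helper: the real build computes this in CMake from
  # CMAKE_SIZEOF_VOID_P and passes it in via --bitness.
  return str(struct.calcsize("P") * 8)

OPDEF_DTOR_EXPORT = {
    "64": "??1OpDef@tensorflow@@UEAA@XZ",  # x64 mangling of the virtual dtor
    "32": "??1OpDef@tensorflow@@UAE@XZ",   # x86 mangling of the virtual dtor
}
print(OPDEF_DTOR_EXPORT[target_bitness()])
```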
diff --git a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
index 721dc4d0801..a5e065b93a2 100644
--- a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
+++ b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
@@ -281,6 +281,21 @@ class CrfTest(test.TestCase):
         self.assertEqual(list(tf_actual_max_sequence[:sequence_lengths]),
                          expected_max_sequence[:sequence_lengths])
 
+  def testCrfDecodeZeroSeqLength(self):
+    """
+    Test that crf_decode works when sequence_length contains one or more zeros.
+    """
+    with self.test_session() as sess:
+      inputs = constant_op.constant(np.ones([2, 10, 5],
+                                            dtype=np.float32))
+      transition_params = constant_op.constant(np.ones([5, 5],
+                                                       dtype=np.float32))
+      sequence_lengths = constant_op.constant(np.zeros([2],
+                                                       dtype=np.int32))
+      values = crf.crf_decode(inputs, transition_params, sequence_lengths)
+      tags, scores = sess.run(values)
+      self.assertEqual(len(tags.shape), 2)
+      self.assertEqual(len(scores.shape), 1)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/crf/python/ops/crf.py b/tensorflow/contrib/crf/python/ops/crf.py
index 1233c8f251c..e37c029cebf 100644
--- a/tensorflow/contrib/crf/python/ops/crf.py
+++ b/tensorflow/contrib/crf/python/ops/crf.py
@@ -479,15 +479,17 @@ def crf_decode(potentials, transition_params, sequence_length):
     initial_state = array_ops.slice(potentials, [0, 0, 0], [-1, 1, -1])
     initial_state = array_ops.squeeze(initial_state, axis=[1])  # [B, O]
     inputs = array_ops.slice(potentials, [0, 1, 0], [-1, -1, -1])  # [B, T-1, O]
+    # sequence length is not allowed to be less than zero
+    sequence_length_less_one = math_ops.maximum(0, sequence_length - 1)
     backpointers, last_score = rnn.dynamic_rnn(  # [B, T - 1, O], [B, O]
         crf_fwd_cell,
         inputs=inputs,
-        sequence_length=sequence_length - 1,
+        sequence_length=sequence_length_less_one,
         initial_state=initial_state,
         time_major=False,
         dtype=dtypes.int32)
     backpointers = gen_array_ops.reverse_sequence(  # [B, T - 1, O]
-        backpointers, sequence_length - 1, seq_dim=1)
+        backpointers, sequence_length_less_one, seq_dim=1)
 
     # Computes backward decoding. Extract tag indices from backpointers.
     crf_bwd_cell = CrfDecodeBackwardRnnCell(num_tags)
@@ -497,7 +499,7 @@ def crf_decode(potentials, transition_params, sequence_length):
     decode_tags, _ = rnn.dynamic_rnn(  # [B, T - 1, 1]
         crf_bwd_cell,
         inputs=backpointers,
-        sequence_length=sequence_length - 1,
+        sequence_length=sequence_length_less_one,
         initial_state=initial_state,
         time_major=False,
         dtype=dtypes.int32)
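The clamp introduced above exists because `dynamic_rnn` and `reverse_sequence` reject negative sequence lengths, and `sequence_length - 1` goes negative exactly when a sequence has length zero. A NumPy sketch of the arithmetic:

```
import numpy as np

sequence_length = np.array([5, 1, 0], dtype=np.int32)

# Naive offset: a zero-length sequence yields -1, which ops such as
# reverse_sequence and dynamic_rnn reject.
print(sequence_length - 1)                 # [ 4  0 -1]

# The clamp used in the hunk above keeps the offset non-negative.
print(np.maximum(0, sequence_length - 1))  # [4 0 0]
```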
diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
index 00d9544602a..d58198faf35 100644
--- a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
+++ b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
@@ -358,7 +358,8 @@ class _CudnnRNN(base_layer.Layer):
             "CUDA/CuDNN generations.")
       # Initialize opaque params with a tensor.
       self.kernel = vs.get_variable(
-          "opaque_kernel", initializer=opaque_params_t, validate_shape=False)
+          "opaque_kernel", dtype=self._plain_dtype,
+          initializer=opaque_params_t, validate_shape=False)
     # Create saveable in the outer scope of the cudnn subgraph, such that
     # alternative subgraph with platform-independent rnn cells can load the
     # checkpoints directly.
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 9d1e8b20c2a..d59dd17aea4 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -4,7 +4,7 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test", "py_test", "tf_py_test")
 
 py_test(
     name = "batch_dataset_op_test",
@@ -482,12 +482,11 @@ py_test(
     ],
 )
 
-py_test(
+cuda_py_test(
     name = "prefetching_ops_test",
     size = "small",
     srcs = ["prefetching_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         "//tensorflow/contrib/data/python/ops:prefetching_ops",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
index dbc35097ddd..78ecce8f7da 100644
--- a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
+++ b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
@@ -163,7 +163,7 @@ class DatasetSerializationTestBase(test.TestCase):
                                  num_outputs,
                                  sparse_tensors=False,
                                  verify_exhausted=True):
-    """Verifies that restoring into an already initilized iterator works.
+    """Verifies that restoring into an already initialized iterator works.
 
     Args:
       ds_fn: See `run_core_tests`.
diff --git a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
index f8556a1b282..43aa4b1bd02 100644
--- a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
@@ -409,7 +409,7 @@ class ParallelInterleaveDatasetTest(test.TestCase):
   def _testTwoThreadsNoContentionWithRaces(self, sloppy=False):
     """Tests where all the workers race in producing elements.
 
-    Note: this is in contrast with the prevous test which carefully sequences
+    Note: this is in contrast with the previous test which carefully sequences
     the execution of the map functions.
 
     Args:
@@ -495,7 +495,7 @@ class ParallelInterleaveDatasetTest(test.TestCase):
   def _testTwoThreadsNoContentionWithRacesAndBlocking(self, sloppy=False):
     """Tests where all the workers race in producing elements.
 
-    Note: this is in contrast with the prevous test which carefully sequences
+    Note: this is in contrast with the previous test which carefully sequences
     the execution of the map functions.
 
 
@@ -928,8 +928,7 @@ class DirectedInterleaveDatasetTest(test.TestCase):
         sess.run(next_element)
 
   def _normalize(self, vec):
-    batched = (len(vec.shape) == 2)
-    return vec / vec.sum(axis=1, keepdims=True) if batched else vec / vec.sum()
+    return vec / vec.sum()
 
   def _chi2(self, expected, actual):
     actual = np.asarray(actual)
@@ -938,35 +937,43 @@ class DirectedInterleaveDatasetTest(test.TestCase):
     chi2 = np.sum(diff * diff / expected, axis=0)
     return chi2
 
+  def _testSampleFromDatasetsHelper(self, weights, num_datasets, num_samples):
+    # Create a dataset that samples each integer in `[0, num_datasets)`
+    # with probability given by `weights[i]`.
+    dataset = interleave_ops.sample_from_datasets([
+        dataset_ops.Dataset.from_tensors(i).repeat(None)
+        for i in range(num_datasets)
+    ], weights)
+    dataset = dataset.take(num_samples)
+    iterator = dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      freqs = np.zeros([num_datasets])
+      for _ in range(num_samples):
+        freqs[sess.run(next_element)] += 1
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+    return freqs
+
   def testSampleFromDatasets(self):
-    random_seed.set_random_seed(1618)
+    random_seed.set_random_seed(1619)
     num_samples = 10000
-    rand_probs = self._normalize(np.random.random_sample((10,)))
-    rand_probs2 = self._normalize(np.random.random_sample((15,)))
+    rand_probs = self._normalize(np.random.random_sample((15,)))
 
-    for probs in [[.5, .5], [.85, .05, .1], rand_probs, rand_probs2]:
+    # Use chi-squared test to assert that the observed distribution matches the
+    # expected distribution. Based on the implementation in
+    # "tensorflow/python/kernel_tests/multinomial_op_test.py".
+    for probs in [[.85, .05, .1], rand_probs]:
       probs = np.asarray(probs)
+      classes = len(probs)
+      freqs = self._testSampleFromDatasetsHelper(probs, classes, num_samples)
+      self.assertLess(self._chi2(probs, freqs / num_samples), 1e-3)
 
-      # Create a dataset that samples each integer in `[0, probs.shape[0])`
-      # with probability given by `probs[i]`.
-      dataset = interleave_ops.sample_from_datasets([
-          dataset_ops.Dataset.from_tensors(i).repeat(None)
-          for i in range(probs.shape[0])
-      ], probs)
-      dataset = dataset.take(num_samples)
-      iterator = dataset.make_one_shot_iterator()
-      next_element = iterator.get_next()
-
-      with self.test_session() as sess:
-        freqs = np.zeros_like(probs)
-        for _ in range(num_samples):
-          freqs[sess.run(next_element)] += 1
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(next_element)
-
-      # Use chi-squared test to assert that the observed distribution
-      # matches the expected distribution. Based on the implementation
-      # in "tensorflow/python/kernel_tests/multinomial_op_test.py".
+      # Also check that `weights` as a dataset samples correctly.
+      probs_ds = dataset_ops.Dataset.from_tensors(probs).repeat()
+      freqs = self._testSampleFromDatasetsHelper(probs_ds, classes, num_samples)
       self.assertLess(self._chi2(probs, freqs / num_samples), 1e-3)
 
   def testErrors(self):
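
The `_chi2` helper above is the acceptance criterion for all of these sampling tests; a small self-contained NumPy sketch of the same statistic, with hypothetical observed counts:

```python
import numpy as np

# Hypothetical counts for expected probabilities [0.85, 0.05, 0.10];
# mirrors `_chi2` above and the test's 1e-3 acceptance threshold.
expected = np.asarray([0.85, 0.05, 0.10])
freqs = np.asarray([8450.0, 520.0, 1030.0])  # 10000 samples total
actual = freqs / freqs.sum()
diff = actual - expected
chi2 = np.sum(diff * diff / expected, axis=0)
assert chi2 < 1e-3
```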
diff --git a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
index 7acbc676ceb..5c74ed6ae72 100644
--- a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
@@ -201,6 +201,14 @@ class StatsDatasetSerializationTest(
         lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).apply(
             stats_ops.bytes_produced_stats("bytes_produced"))
 
+  def test_bytes_produced_stats_invalid_tag_shape(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(
+          lambda: dataset_ops.Dataset.range(100).apply(
+              stats_ops.bytes_produced_stats(["bytes_produced"])),
+          None, 100)
+
   def testBytesStatsDatasetSaveableCore(self):
     num_outputs = 100
     self.run_core_tests(
@@ -218,6 +226,14 @@ class StatsDatasetSerializationTest(
     return dataset_ops.Dataset.range(num_elements).apply(
         stats_ops.latency_stats(tag1)).apply(stats_ops.latency_stats(tag2))
 
+  def test_latency_stats_invalid_tag_shape(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(
+          lambda: dataset_ops.Dataset.range(100).apply(
+              stats_ops.latency_stats(["record_latency", "record_latency_2"])),
+          None, 100)
+
   def testLatencyStatsDatasetSaveableCore(self):
     num_outputs = 100
 
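
The new tests pin down a shape constraint: the stats tag must be a scalar string. A minimal sketch, assuming the contrib import path used by the test file:

```python
import tensorflow as tf
from tensorflow.contrib.data.python.ops import stats_ops

# A scalar tag passes shape inference; a rank-1 tag (a list) fails with
# "Shape must be rank 0 but is rank 1", as the tests above assert.
ok = tf.data.Dataset.range(100).apply(
    stats_ops.latency_stats("record_latency"))        # scalar tag: fine
# stats_ops.latency_stats(["record_latency"])         # rank-1 tag: ValueError
```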
diff --git a/tensorflow/contrib/data/python/ops/interleave_ops.py b/tensorflow/contrib/data/python/ops/interleave_ops.py
index 106a1ef388a..812a50ecbf1 100644
--- a/tensorflow/contrib/data/python/ops/interleave_ops.py
+++ b/tensorflow/contrib/data/python/ops/interleave_ops.py
@@ -200,10 +200,11 @@ def sample_from_datasets(datasets, weights=None, seed=None):
 
   Args:
     datasets: A list of @{tf.data.Dataset} objects with compatible structure.
-    weights: (Optional.) A list of `len(datasets)` floating-point values,
-      where `weights[i]` represents the probability with which an element
-      should be sampled from `datasets[i]`. Defaults to a uniform distribution
-      across `datasets`.
+    weights: (Optional.) A list of `len(datasets)` floating-point values where
+      `weights[i]` represents the probability with which an element should be
+      sampled from `datasets[i]`, or a @{tf.data.Dataset} object where each
+      element is such a list. Defaults to a uniform distribution across
+      `datasets`.
     seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
       random seed that will be used to create the distribution. See
       @{tf.set_random_seed} for behavior.
@@ -219,24 +220,23 @@ def sample_from_datasets(datasets, weights=None, seed=None):
   """
   num_datasets = len(datasets)
   if weights is None:
-    weights = array_ops.ones(
-        [num_datasets], dtype=dtypes.float32, name="weights")
-  else:
+    weights = dataset_ops.Dataset.from_tensors([1.0] * num_datasets).repeat()
+  elif not isinstance(weights, dataset_ops.Dataset):
     weights = ops.convert_to_tensor(weights, name="weights")
     if weights.dtype not in (dtypes.float32, dtypes.float64):
       raise TypeError("`weights` must be convertible to a tensor of "
                       "`tf.float32` or `tf.float64` elements.")
     if not weights.shape.is_compatible_with([num_datasets]):
       raise ValueError("`weights` must be a vector of length `len(datasets)`.")
+    weights = dataset_ops.Dataset.from_tensors(weights).repeat()
 
   # The `stateless_multinomial()` op expects log-probabilities, as opposed to
   # weights.
-  logits = math_ops.log(weights, name="logits")
-
-  def select_dataset(seed):
+  logits_ds = weights.map(lambda *p: math_ops.log(p, name="logits"))
+  def select_dataset(logits, seed):
     return array_ops.squeeze(
-        stateless.stateless_multinomial([logits], 1, seed=seed), axis=[0, 1])
-
-  selector_input = random_ops.RandomDataset(seed).batch(2).map(select_dataset)
+        stateless.stateless_multinomial(logits, 1, seed=seed), axis=[0, 1])
+  selector_input = dataset_ops.Dataset.zip(
+      (logits_ds, random_ops.RandomDataset(seed).batch(2))).map(select_dataset)
 
   return DirectedInterleaveDataset(selector_input, datasets)
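
For reference, a minimal usage sketch of the extended `weights` argument (assuming the contrib-era `tf.contrib.data` export of `sample_from_datasets`; the probabilities are illustrative):

```python
import tensorflow as tf

# `weights` may now be a Dataset whose elements are per-dataset probability
# vectors, re-evaluated on every draw, instead of a single static vector.
datasets = [tf.data.Dataset.from_tensors(i).repeat() for i in range(3)]
weights_ds = tf.data.Dataset.from_tensors([0.85, 0.05, 0.10]).repeat()
mixed = tf.contrib.data.sample_from_datasets(datasets, weights=weights_ds)
```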
diff --git a/tensorflow/contrib/data/python/ops/prefetching_ops.py b/tensorflow/contrib/data/python/ops/prefetching_ops.py
index 89c04dc89a2..e4c9f8b58a2 100644
--- a/tensorflow/contrib/data/python/ops/prefetching_ops.py
+++ b/tensorflow/contrib/data/python/ops/prefetching_ops.py
@@ -114,11 +114,13 @@ class _PrefetchToDeviceIterator(object):
       ret = remote_iterator.get_next()
       return nest.flatten(sparse.serialize_sparse_tensors(ret))
 
+    iterator_device = gen_dataset_ops.iterator_get_device(
+        self._input_iterator._iterator_resource)
+
     with ops.device(device):
       self._buffering_resource = function_buffering_resource(
           f=_prefetch_fn,
-          target_device=gen_dataset_ops.iterator_get_device(
-              self._input_iterator._iterator_resource),
+          target_device=iterator_device,
           string_arg=input_iterator_handle,
           buffer_size=buffer_size,
           shared_name=shared_name)
diff --git a/tensorflow/contrib/data/python/ops/scan_ops.py b/tensorflow/contrib/data/python/ops/scan_ops.py
index 711a538697a..60ef7efba4b 100644
--- a/tensorflow/contrib/data/python/ops/scan_ops.py
+++ b/tensorflow/contrib/data/python/ops/scan_ops.py
@@ -57,7 +57,7 @@ class _ScanDataset(dataset_ops.Dataset):
     self._output_shapes = None
     self._output_types = None
 
-    # Iteratively rerun the scan function until reaching a fixed pont on
+    # Iteratively rerun the scan function until reaching a fixed point on
     # `self._state_shapes`.
     need_to_rerun = True
     while need_to_rerun:
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py
index c8d795c3f6a..243b5a03485 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py
@@ -584,7 +584,6 @@ class DistributionShapeTest(test.TestCase):
 
   def testDistributionShapeGetDimsStatic(self):
     with self.test_session():
-      shaper = _DistributionShape(batch_ndims=0, event_ndims=0)
       shaper = _DistributionShape(batch_ndims=0, event_ndims=0)
       x = 1
       self.assertAllEqual((_empty_shape, _empty_shape, _empty_shape),
diff --git a/tensorflow/contrib/eager/python/saver_test.py b/tensorflow/contrib/eager/python/saver_test.py
index 1a7f7b85e68..4032e755f6e 100644
--- a/tensorflow/contrib/eager/python/saver_test.py
+++ b/tensorflow/contrib/eager/python/saver_test.py
@@ -102,7 +102,6 @@ class SaverTest(test.TestCase):
       # Can still restore it.
       saver.restore(ckpt_prefix)
       self.assertEqual(v1.read_value().numpy(), 1.0)
-      self.assertEqual(v1.read_value().numpy(), 1.0)
       # However, cannot restore it with default name.
       with self.assertRaisesOpError('not found in checkpoint'):
         saver = _saver.Saver([v1, v2]).restore(ckpt_prefix)
diff --git a/tensorflow/contrib/estimator/python/estimator/head.py b/tensorflow/contrib/estimator/python/estimator/head.py
index ae2fd8b4902..3dcf0374c8a 100644
--- a/tensorflow/contrib/estimator/python/estimator/head.py
+++ b/tensorflow/contrib/estimator/python/estimator/head.py
@@ -485,7 +485,7 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
           reduction=losses.Reduction.NONE)
       # Averages loss over classes.
       unweighted_loss = math_ops.reduce_mean(
-          unweighted_loss, axis=-1, keep_dims=True)
+          unweighted_loss, axis=-1, keepdims=True)
     weights = head_lib._get_weights_and_check_match_logits(  # pylint:disable=protected-access,
         features=features, weight_column=self._weight_column, logits=logits)
     training_loss = losses.compute_weighted_loss(
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
index fa2697800ec..a8774d6dab9 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
@@ -456,7 +456,7 @@ def _get_local_devices(device_type):
 
 
 def _split_batch(features, labels, number_of_shards, device):
-  """Split input features and labes into batches."""
+  """Split input features and labels into batches."""
 
   def ensure_divisible_by_shards(sequence):
     batch_size = ops_lib.convert_to_tensor(sequence).get_shape()[0]
@@ -602,7 +602,7 @@ def _local_device_setter(worker_device, ps_devices, ps_strategy):
 
 
 def _scale_tower_loss(tower_spec, loss_reduction, number_of_towers):
-  """Produce an EstimatorSpec with approproriately scaled loss."""
+  """Produce an EstimatorSpec with appropriately scaled loss."""
   if tower_spec.loss is None:
     return tower_spec
 
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
index 5d77bc77e12..ccdd679d6ae 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
@@ -54,10 +54,10 @@ def _covariance(x, diag):
   diagonal matrix just the diagonal is returned.
   """
   num_points = math_ops.to_float(array_ops.shape(x)[0])
-  x -= math_ops.reduce_mean(x, 0, keep_dims=True)
+  x -= math_ops.reduce_mean(x, 0, keepdims=True)
   if diag:
     cov = math_ops.reduce_sum(
-        math_ops.square(x), 0, keep_dims=True) / (num_points - 1)
+        math_ops.square(x), 0, keepdims=True) / (num_points - 1)
   else:
     cov = math_ops.matmul(x, x, transpose_a=True) / (num_points - 1)
   return cov
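
A NumPy sketch of the full-covariance branch above, on hypothetical data, with `np.cov` as a reference:

```python
import numpy as np

# Center the points, then form x^T x / (n - 1).
x = np.random.randn(100, 3)
xc = x - x.mean(axis=0, keepdims=True)
cov = xc.T @ xc / (x.shape[0] - 1)
np.testing.assert_allclose(cov, np.cov(x, rowvar=False))
```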
@@ -313,7 +313,7 @@ class GmmAlgorithm(object):
     # TODO(xavigonzalvo): look into alternatives to log for
     # reparametrization of variance parameters.
     det_expanded = math_ops.reduce_sum(
-        math_ops.log(self._covs + 1e-3), 1, keep_dims=True)
+        math_ops.log(self._covs + 1e-3), 1, keepdims=True)
     diff = shard - self._means
     x2 = math_ops.square(diff)
     cov_expanded = array_ops.expand_dims(1.0 / (self._covs + 1e-3), 2)
@@ -351,7 +351,7 @@ class GmmAlgorithm(object):
       shard_id: id of current shard_id.
     """
     self._prior_probs[shard_id] = math_ops.reduce_logsumexp(
-        self._probs[shard_id], axis=1, keep_dims=True)
+        self._probs[shard_id], axis=1, keepdims=True)
 
   def _define_expectation_operation(self, shard_id):
     # Shape broadcasting.
@@ -375,7 +375,7 @@ class GmmAlgorithm(object):
     """
     # Soft assignment of each data point to each of the two clusters.
     self._points_in_k[shard_id] = math_ops.reduce_sum(
-        self._w[shard_id], 0, keep_dims=True)
+        self._w[shard_id], 0, keepdims=True)
     # Partial means.
     w_mul_x = array_ops.expand_dims(
         math_ops.matmul(
@@ -454,7 +454,7 @@ class GmmAlgorithm(object):
     for shard_id, prior_probs in enumerate(self._prior_probs):
       op.append(prior_probs + math_ops.log(self._w[shard_id]))
     self._scores = array_ops.squeeze(
-        math_ops.reduce_logsumexp(op, axis=2, keep_dims=True), axis=0)
+        math_ops.reduce_logsumexp(op, axis=2, keepdims=True), axis=0)
 
 
 def gmm(inp,
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py
index bfe338c9f9a..9ffdd3ba5e8 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans.py
@@ -374,11 +374,11 @@ class KMeansClustering(estimator.Estimator):
               than `num_clusters`, a TensorFlow runtime error occurs.
       distance_metric: The distance metric used for clustering. One of:
         * `KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE`: Euclidean distance
-             between vectors `u` and `v` is defined as `\\(||u - v||_2\\)`
+             between vectors `u` and `v` is defined as \\(||u - v||_2\\)
              which is the square root of the sum of the absolute squares of
              the elements' difference.
         * `KMeansClustering.COSINE_DISTANCE`: Cosine distance between vectors
-             `u` and `v` is defined as `\\(1 - (u . v) / (||u||_2 ||v||_2)\\)`.
+             `u` and `v` is defined as \\(1 - (u . v) / (||u||_2 ||v||_2)\\).
       random_seed: Python integer. Seed for PRNG used to initialize centers.
       use_mini_batch: A boolean specifying whether to use the mini-batch k-means
         algorithm. See explanation above.
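
A NumPy sketch of the two distance metrics described in the docstring above, on hypothetical vectors:

```python
import numpy as np

u, v = np.array([1.0, 2.0, 2.0]), np.array([0.0, 2.0, 1.0])
euclidean = np.linalg.norm(u - v)  # \(||u - v||_2\)
cosine = 1.0 - u.dot(v) / (np.linalg.norm(u) * np.linalg.norm(v))
```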
diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index bb4f1eb3847..11397e86bd8 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -118,12 +118,13 @@ from tensorflow.python.framework.smart_cond import smart_cond
 from tensorflow.python.framework.smart_cond import smart_constant_value
 from tensorflow.python.framework.tensor_spec import BoundedTensorSpec
 from tensorflow.python.framework.tensor_spec import TensorSpec
+from tensorflow.python.ops.array_ops import broadcast_to
 from tensorflow.python.ops.init_ops import convolutional_delta_orthogonal
 from tensorflow.python.ops.init_ops import convolutional_orthogonal_1d
 from tensorflow.python.ops.init_ops import convolutional_orthogonal_2d
 from tensorflow.python.ops.init_ops import convolutional_orthogonal_3d
 from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = ['nest']
+_allowed_symbols = ['nest', 'broadcast_to']
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/framework/python/framework/tensor_util_test.py b/tensorflow/contrib/framework/python/framework/tensor_util_test.py
index a2834b64893..8fc4f60492b 100644
--- a/tensorflow/contrib/framework/python/framework/tensor_util_test.py
+++ b/tensorflow/contrib/framework/python/framework/tensor_util_test.py
@@ -48,7 +48,7 @@ class LocalVariabletest(test.TestCase):
       variables = variables_lib.local_variables()
       self.assertEquals(2, len(variables))
       self.assertRaises(errors_impl.OpError, sess.run, variables)
-      variables_lib.initialize_variables(variables).run()
+      variables_lib.variables_initializer(variables).run()
       self.assertAllEqual(set([value0, value1]), set(sess.run(variables)))
 
 
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
index a97adf622e6..983b6dc8e5a 100644
--- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
@@ -65,7 +65,7 @@ def fused_conv2d_bias_activation(conv_input,
     side_input_scale: A scalar `float32` that will be multiplied by side_input.
         This is optional and defaults to 0.
     side_input: A `Tensor` of the format specified by `data_format`.
-        This is useful for imlementing ResNet blocks.
+        This is useful for implementing ResNet blocks.
     activation_mode: (optional) currently must be the default "Relu".
         Note that in qint8 mode, it also clips to 127, so acts like ReluX.
     data_format: Specifies the data format.
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
index bb155aa2496..3d0ed899322 100644
--- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
@@ -566,7 +566,7 @@ def GetInceptionFwdTest(input_size, filter_size, stride, padding,
   return Test
 
 
-def CalculateCovolvedOutputDim(input_dim, filter_dim, stride, padding_type):
+def CalculateConvolvedOutputDim(input_dim, filter_dim, stride, padding_type):
   """Calculates the size of an output dimension of a strided convolution.
 
   Given the sizes of the corresponding dimension of the input and filter shapes,
@@ -827,10 +827,10 @@ class FusedConvInt8Tests(test.TestCase):
             maxval=1.0,
             dtype=dtypes.float32), -1.0, 1.0, dtypes.qint8)
 
-    output_height = CalculateCovolvedOutputDim(input_height, filter_height,
-                                               vertical_stride, padding_type)
-    output_width = CalculateCovolvedOutputDim(input_width, filter_width,
-                                              horizontal_stride, padding_type)
+    output_height = CalculateConvolvedOutputDim(input_height, filter_height,
+                                                vertical_stride, padding_type)
+    output_width = CalculateConvolvedOutputDim(input_width, filter_width,
+                                               horizontal_stride, padding_type)
     print("output_height=", output_height, ", output_width=", output_width)
 
     side_input, _, _ = gen_array_ops.quantize_v2(
diff --git a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py
index 4b10bc0f8e6..4b1105f6bd4 100644
--- a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py
@@ -161,7 +161,7 @@ def _sliced_wasserstein(a, b, random_sampling_count, random_projection_dim):
     proj = random_ops.random_normal(
         [array_ops.shape(a)[1], random_projection_dim])
     proj *= math_ops.rsqrt(
-        math_ops.reduce_sum(math_ops.square(proj), 0, keep_dims=True))
+        math_ops.reduce_sum(math_ops.square(proj), 0, keepdims=True))
     # Project both distributions and sort them.
     proj_a = math_ops.matmul(a, proj)
     proj_b = math_ops.matmul(b, proj)
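
A NumPy sketch of the normalization applied to `proj` above: `rsqrt` of the per-column sum of squares rescales each random projection direction to unit L2 norm before projecting:

```python
import numpy as np

proj = np.random.randn(64, 16)
proj *= 1.0 / np.sqrt(np.sum(np.square(proj), axis=0, keepdims=True))
np.testing.assert_allclose(np.linalg.norm(proj, axis=0), 1.0)
```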
diff --git a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py b/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py
index f8b372546b6..650eab97a39 100644
--- a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py
+++ b/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py
@@ -64,11 +64,11 @@ def _statistics(x, axes):
   y = math_ops.cast(x, dtypes.float32) if x.dtype == dtypes.float16 else x
 
   # Compute true mean while keeping the dims for proper broadcasting.
-  shift = array_ops.stop_gradient(math_ops.reduce_mean(y, axes, keep_dims=True))
+  shift = array_ops.stop_gradient(math_ops.reduce_mean(y, axes, keepdims=True))
 
-  shifted_mean = math_ops.reduce_mean(y - shift, axes, keep_dims=True)
+  shifted_mean = math_ops.reduce_mean(y - shift, axes, keepdims=True)
   mean = shifted_mean + shift
-  mean_squared = math_ops.reduce_mean(math_ops.square(y), axes, keep_dims=True)
+  mean_squared = math_ops.reduce_mean(math_ops.square(y), axes, keepdims=True)
 
   mean = array_ops.squeeze(mean, axes)
   mean_squared = array_ops.squeeze(mean_squared, axes)
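
A NumPy sketch of the shifted-mean identity that `_statistics` relies on: for any constant `shift`, `mean(y) == mean(y - shift) + shift`, so a stop-gradient shift can be used for numerically stabler accumulation without changing the result:

```python
import numpy as np

y = np.random.randn(8, 4) * 1e3 + 1e6  # hypothetical large-offset data
shift = y.mean(axis=0, keepdims=True)  # stands in for the stop-gradient shift
mean = (y - shift).mean(axis=0, keepdims=True) + shift
np.testing.assert_allclose(mean, y.mean(axis=0, keepdims=True))
```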
diff --git a/tensorflow/contrib/hvx/README.md b/tensorflow/contrib/hvx/README.md
index 163993a3f6b..68e34f3b093 100644
--- a/tensorflow/contrib/hvx/README.md
+++ b/tensorflow/contrib/hvx/README.md
@@ -42,11 +42,12 @@ If you've finished walking through the quick start guide, you may want to try bu
 
 ### Build libhexagon\_nn\_skel.so
 
-Download Hexagon NN library from codeaurora.org and build it.
+Download Hexagon NN library from codeaurora.org and build it. For Hexagon SDK 3.0, use the compatible nnlib version ([721b2d58f](https://source.codeaurora.org/quic/hexagon_nn/nnlib/commit/?id=721b2d58f0f4e2d5b182f41e6b7c4db5356bf0fb)).
 
 ```shell
 git clone https://source.codeaurora.org/quic/hexagon_nn/nnlib
 cd nnlib
+git reset 721b2d58f --hard
 ```
 
 Just follow the instructions in `README.HOW_TO_BUILD`. You can find the file `libhexagon_nn_skel.so` in `hexagon_Release_dynamic_toolv72_v60/ship`.
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
index 1be97ae3d6e..bbb3a3b18fd 100644
--- a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
@@ -53,7 +53,7 @@ void AdjustHsvInYiqGPU::operator()(OpKernelContext* ctx, int channel_count,
   OP_REQUIRES_OK(ctx, ctx->allocate_temp(
                           DT_FLOAT, TensorShape({kChannelSize * kChannelSize}),
                           &tranformation_matrix));
-  // TODO(huangyp): It takes about 3.5 us to comute tranformation_matrix
+  // TODO(huangyp): It takes about 3.5 us to compute tranformation_matrix
   // with one thread. Improve its performance if necessary.
   internal::compute_tranformation_matrix_cuda<<<1, 1, 0, cu_stream>>>(
       delta_h, scale_s, scale_v, tranformation_matrix.flat().data(),
diff --git a/tensorflow/contrib/image/ops/distort_image_ops.cc b/tensorflow/contrib/image/ops/distort_image_ops.cc
index b169b0b2b22..ca49635d5d0 100644
--- a/tensorflow/contrib/image/ops/distort_image_ops.cc
+++ b/tensorflow/contrib/image/ops/distort_image_ops.cc
@@ -36,9 +36,9 @@ REGISTER_OP("AdjustHsvInYiq")
 Adjust the YIQ hue of one or more images.
 
 `images` is a tensor of at least 3 dimensions.  The last dimension is
-interpretted as channels, and must be three.
+interpreted as channels, and must be three.
 
-We used linear transfomation described in:
+We used linear transformation described in:
  beesbuzz.biz/code/hsv_color_transforms.php
 The input image is considered in the RGB colorspace. Conceptually, the RGB
 colors are first mapped into YIQ space, rotated around the Y channel by
diff --git a/tensorflow/contrib/image/ops/image_ops.cc b/tensorflow/contrib/image/ops/image_ops.cc
index e97267fb89f..295908d44b9 100644
--- a/tensorflow/contrib/image/ops/image_ops.cc
+++ b/tensorflow/contrib/image/ops/image_ops.cc
@@ -137,7 +137,7 @@ row_to_col_match_indices: A vector of length num_rows, which is the number of
   If `row_to_col_match_indices[i]` is not -1, row i is matched to column
   `row_to_col_match_indices[i]`.
 col_to_row_match_indices: A vector of length num_columns, which is the number
-  of columns of the input ditance matrix.
+  of columns of the input distance matrix.
   If `col_to_row_match_indices[j]` is not -1, column j is matched to row
   `col_to_row_match_indices[j]`.
 )doc");
diff --git a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
index 8139d4272d6..bd784c6bda0 100755
--- a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
+++ b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
@@ -69,7 +69,7 @@ Outputs a single image random dot stereogram for export via encode_PNG/JPG OP.
 Given the 2-D tensor 'depth_values' with encoded Z values, this operation will
 encode 3-D data into a 2-D image.  The output of this Op is suitable for the
 encode_PNG/JPG ops.  Be careful with image compression as this may corrupt the
-encode 3-D data witin the image.
+encoded 3-D data within the image.
 
 This Op is based upon:
 'http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper'
@@ -111,7 +111,7 @@ output_image_shape: Output size of returned image in X,Y, Channels 1-grayscale,
 output_data_window: Size of "DATA" window, must be equal to or smaller than 'output_image_shape', will be centered
   and use 'convergence_dots_size' for best fit to avoid overlap if possible
 
-image:= A tensor of size 'output_image_shape' with the encloded 'depth_values'
+image:= A tensor of size 'output_image_shape' with the encoded 'depth_values'
 )doc");
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index a8d8cf8c5c6..d3c114a88d6 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -438,7 +438,7 @@ def bipartite_match(distance_mat,
       of rows of the input `distance_matrix`. If `row_to_col_match_indices[i]`
       is not -1, row i is matched to column `row_to_col_match_indices[i]`.
     col_to_row_match_indices: A vector of length num_columns, which is the
-      number of columns of the input ditance matrix.
+      number of columns of the input distance matrix.
       If `col_to_row_match_indices[j]` is not -1, column j is matched to row
       `col_to_row_match_indices[j]`.
   """
diff --git a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
index d4a6a5bcbb5..0ceb683ff4c 100755
--- a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
+++ b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
@@ -45,7 +45,7 @@ def single_image_random_dot_stereograms(depth_values,
   Given the 2-D tensor 'depth_values' with encoded Z values, this operation
   will encode 3-D data into a 2-D image.  The output of this Op is suitable
   for the encode_PNG/JPG ops.  Be careful with image compression as this may
-  corrupt the encode 3-D data witin the image.
+  corrupt the encoded 3-D data within the image.
 
   Based upon [this
   paper](http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper).
diff --git a/tensorflow/contrib/kfac/python/ops/loss_functions.py b/tensorflow/contrib/kfac/python/ops/loss_functions.py
index e7d4243fc3d..42d525c2c21 100644
--- a/tensorflow/contrib/kfac/python/ops/loss_functions.py
+++ b/tensorflow/contrib/kfac/python/ops/loss_functions.py
@@ -613,19 +613,19 @@ class CategoricalLogitsNegativeLogProbLoss(DistributionNegativeLogProbLoss,
   def multiply_fisher(self, vector):
     probs = self._probs
     return vector * probs - probs * math_ops.reduce_sum(
-        vector * probs, axis=-1, keep_dims=True)
+        vector * probs, axis=-1, keepdims=True)
 
   def multiply_fisher_factor(self, vector):
     probs = self._probs
     sqrt_probs = self._sqrt_probs
     return sqrt_probs * vector - probs * math_ops.reduce_sum(
-        sqrt_probs * vector, axis=-1, keep_dims=True)
+        sqrt_probs * vector, axis=-1, keepdims=True)
 
   def multiply_fisher_factor_transpose(self, vector):
     probs = self._probs
     sqrt_probs = self._sqrt_probs
     return sqrt_probs * vector - sqrt_probs * math_ops.reduce_sum(
-        probs * vector, axis=-1, keep_dims=True)
+        probs * vector, axis=-1, keepdims=True)
 
   def multiply_fisher_factor_replicated_one_hot(self, index):
     assert len(index) == 1, "Length of index was {}".format(len(index))
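
For categorical logits, the Fisher is `diag(p) - p p^T`; `multiply_fisher` above applies it without materializing the matrix. A NumPy check on hypothetical values:

```python
import numpy as np

p = np.array([0.7, 0.2, 0.1])   # softmax probabilities
v = np.array([1.0, -2.0, 0.5])  # arbitrary vector
direct = (np.diag(p) - np.outer(p, p)) @ v
matrix_free = v * p - p * np.sum(v * p)  # as in multiply_fisher
np.testing.assert_allclose(direct, matrix_free)
```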
diff --git a/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py b/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py
index 705a871d482..4279cb27928 100644
--- a/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py
+++ b/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py
@@ -33,7 +33,6 @@ _allowed_symbols = [
     "CategoricalLogitsNegativeLogProbLoss",
     "OnehotCategoricalLogitsNegativeLogProbLoss",
     "MultiBernoulliNegativeLogProbLoss",
-    "MultiBernoulliNegativeLogProbLoss",
     "insert_slice_in_zeros",
 ]
 
diff --git a/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py b/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py
index 0727f4cf887..39e9d65407f 100644
--- a/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py
+++ b/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py
@@ -660,7 +660,7 @@ class ReduceSumTest(Base):
     sum_lt = ops.reduce_sum(self.original_lt, {('channel', 'hihowareyou')})
     golden_lt = core.LabeledTensor(
         math_ops.reduce_sum(
-            self.original_lt.tensor, 1, keep_dims=True),
+            self.original_lt.tensor, 1, keepdims=True),
         [self.a0, ('channel', ['hihowareyou']), self.a2, self.a3])
     self.assertLabeledTensorsEqual(sum_lt, golden_lt)
 
@@ -668,7 +668,7 @@ class ReduceSumTest(Base):
     sum_lt = ops.reduce_sum(self.original_lt, ('channel', 'hihowareyou'))
     golden_lt = core.LabeledTensor(
         math_ops.reduce_sum(
-            self.original_lt.tensor, 1, keep_dims=True),
+            self.original_lt.tensor, 1, keepdims=True),
         [self.a0, ('channel', ['hihowareyou']), self.a2, self.a3])
     self.assertLabeledTensorsEqual(sum_lt, golden_lt)
 
diff --git a/tensorflow/contrib/layers/python/kernel_tests/sparse_feature_cross_op_test.py b/tensorflow/contrib/layers/python/kernel_tests/sparse_feature_cross_op_test.py
index f701647c2b2..28ddaa69a14 100644
--- a/tensorflow/contrib/layers/python/kernel_tests/sparse_feature_cross_op_test.py
+++ b/tensorflow/contrib/layers/python/kernel_tests/sparse_feature_cross_op_test.py
@@ -200,7 +200,7 @@ class SparseCrossOpTest(test.TestCase):
       self._assert_sparse_tensor_equals(expected_out, sess.run(op))
 
   def test_large_batch(self):
-    """Tests with large batch size to force multithreding.
+    """Tests with large batch size to force multithreading.
     """
     batch_size = 5000
     col1 = []
diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py
index 9ccb589d698..3ae07cedab0 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column.py
@@ -48,7 +48,7 @@ you should choose depends on (1) the feature type and (2) the model type.
    recommended.
 
      embedded_dept_column = embedding_column(
-       sparse_column_with_keys("department", ["math", "philosphy", ...]),
+       sparse_column_with_keys("department", ["math", "philosophy", ...]),
        dimension=10)
 
 * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops.py b/tensorflow/contrib/layers/python/layers/feature_column_ops.py
index 78affea44cb..06060b99e7e 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops.py
@@ -815,7 +815,7 @@ class _Transformer(object):
   """
 
   def __init__(self, columns_to_tensors):
-    """Initializes transfomer.
+    """Initializes transformer.
 
     Args:
       columns_to_tensors: A mapping from feature columns to tensors. 'string'
@@ -908,7 +908,7 @@ def _gather_feature_columns(feature_columns):
 
 
 def _check_forbidden_sequence_columns(feature_columns):
-  """Recursively cecks `feature_columns` for `_FORBIDDEN_SEQUENCE_COLUMNS`."""
+  """Recursively checks `feature_columns` for `_FORBIDDEN_SEQUENCE_COLUMNS`."""
   all_feature_columns = _gather_feature_columns(feature_columns)
   for feature_column in all_feature_columns:
     if isinstance(feature_column, _FORBIDDEN_SEQUENCE_COLUMNS):
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 25c3b1e7ea0..2f3e57653c5 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -932,7 +932,8 @@ def convolution(inputs,
                 variables_collections=None,
                 outputs_collections=None,
                 trainable=True,
-                scope=None):
+                scope=None,
+                conv_dims=None):
   """Adds an N-D convolution followed by an optional batch_norm layer.
 
   It is required that 1 <= N <= 3.
@@ -993,6 +994,10 @@ def convolution(inputs,
     trainable: If `True` also add variables to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
     scope: Optional scope for `variable_scope`.
+    conv_dims: Optional convolution dimensionality. When set, the
+      corresponding convolution is used (e.g. 2 for Conv2D, 3 for Conv3D).
+      When left as None, the convolution dimensionality is inferred from
+      the input rank (i.e. ConvND, with N = input_rank - 2).
 
   Returns:
     A tensor representing the output of the operation.
@@ -1015,6 +1020,9 @@ def convolution(inputs,
     inputs = ops.convert_to_tensor(inputs)
     input_rank = inputs.get_shape().ndims
 
+    if conv_dims is not None and conv_dims + 2 != input_rank:
+      raise ValueError('Convolution expects input with rank %d, got %d' %
+                       (conv_dims + 2, input_rank))
     if input_rank == 3:
       layer_class = convolutional_layers.Convolution1D
     elif input_rank == 4:
@@ -1061,10 +1069,134 @@ def convolution(inputs,
       outputs = activation_fn(outputs)
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
 
+@add_arg_scope
+def convolution1d(inputs,
+                  num_outputs,
+                  kernel_size,
+                  stride=1,
+                  padding='SAME',
+                  data_format=None,
+                  rate=1,
+                  activation_fn=nn.relu,
+                  normalizer_fn=None,
+                  normalizer_params=None,
+                  weights_initializer=initializers.xavier_initializer(),
+                  weights_regularizer=None,
+                  biases_initializer=init_ops.zeros_initializer(),
+                  biases_regularizer=None,
+                  reuse=None,
+                  variables_collections=None,
+                  outputs_collections=None,
+                  trainable=True,
+                  scope=None):
+  return convolution(inputs,
+                     num_outputs,
+                     kernel_size,
+                     stride,
+                     padding,
+                     data_format,
+                     rate,
+                     activation_fn,
+                     normalizer_fn,
+                     normalizer_params,
+                     weights_initializer,
+                     weights_regularizer,
+                     biases_initializer,
+                     biases_regularizer,
+                     reuse,
+                     variables_collections,
+                     outputs_collections,
+                     trainable,
+                     scope,
+                     conv_dims=1)
 
-convolution2d = convolution
-convolution3d = convolution
+convolution1d.__doc__ = convolution.__doc__
 
+@add_arg_scope
+def convolution2d(inputs,
+                  num_outputs,
+                  kernel_size,
+                  stride=1,
+                  padding='SAME',
+                  data_format=None,
+                  rate=1,
+                  activation_fn=nn.relu,
+                  normalizer_fn=None,
+                  normalizer_params=None,
+                  weights_initializer=initializers.xavier_initializer(),
+                  weights_regularizer=None,
+                  biases_initializer=init_ops.zeros_initializer(),
+                  biases_regularizer=None,
+                  reuse=None,
+                  variables_collections=None,
+                  outputs_collections=None,
+                  trainable=True,
+                  scope=None):
+  return convolution(inputs,
+                     num_outputs,
+                     kernel_size,
+                     stride,
+                     padding,
+                     data_format,
+                     rate,
+                     activation_fn,
+                     normalizer_fn,
+                     normalizer_params,
+                     weights_initializer,
+                     weights_regularizer,
+                     biases_initializer,
+                     biases_regularizer,
+                     reuse,
+                     variables_collections,
+                     outputs_collections,
+                     trainable,
+                     scope,
+                     conv_dims=2)
+
+convolution2d.__doc__ = convolution.__doc__
+
+@add_arg_scope
+def convolution3d(inputs,
+                  num_outputs,
+                  kernel_size,
+                  stride=1,
+                  padding='SAME',
+                  data_format=None,
+                  rate=1,
+                  activation_fn=nn.relu,
+                  normalizer_fn=None,
+                  normalizer_params=None,
+                  weights_initializer=initializers.xavier_initializer(),
+                  weights_regularizer=None,
+                  biases_initializer=init_ops.zeros_initializer(),
+                  biases_regularizer=None,
+                  reuse=None,
+                  variables_collections=None,
+                  outputs_collections=None,
+                  trainable=True,
+                  scope=None):
+  return convolution(inputs,
+                     num_outputs,
+                     kernel_size,
+                     stride,
+                     padding,
+                     data_format,
+                     rate,
+                     activation_fn,
+                     normalizer_fn,
+                     normalizer_params,
+                     weights_initializer,
+                     weights_regularizer,
+                     biases_initializer,
+                     biases_regularizer,
+                     reuse,
+                     variables_collections,
+                     outputs_collections,
+                     trainable,
+                     scope,
+                     conv_dims=3)
+
+convolution3d.__doc__ = convolution.__doc__
 
 @add_arg_scope
 def convolution2d_in_plane(
@@ -1411,7 +1543,7 @@ def dense_to_sparse(tensor, eos_token=0, outputs_collections=None, scope=None):
   Args:
      tensor: An `int` `Tensor` to be converted to a `Sparse`.
      eos_token: An integer.
-       It is part of the target label that signfies the end of a sentence.
+       It is part of the target label that signifies the end of a sentence.
      outputs_collections: Collection to add the outputs.
      scope: Optional scope for name_scope.
   """
@@ -1555,7 +1687,7 @@ def _inner_flatten(inputs, new_rank, output_collections=None, scope=None):
     output_collections: Collection to which the outputs will be added.
     scope: Optional scope for `name_scope`.
   Returns:
-    A `Tensor` or `SparseTensor` conataining the same values as `inputs`, but
+    A `Tensor` or `SparseTensor` containing the same values as `inputs`, but
     with innermost dimensions flattened to obtain rank `new_rank`.
 
   Raises:
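
A minimal sketch of the new rank-checked wrappers (assuming the contrib module path; shapes are illustrative). Rank-3 input (batch, width, channels) routes to `Convolution1D`, and a mismatched `conv_dims` now raises instead of silently selecting a layer from the input rank:

```python
import tensorflow as tf
from tensorflow.contrib.layers.python.layers import layers as layers_lib

seq = tf.random_uniform((5, 20, 8))
out = layers_lib.convolution1d(seq, num_outputs=16, kernel_size=3)
print(out.get_shape())  # (5, 20, 16)

# layers_lib.convolution2d(seq, 16, 3) would raise:
# "Convolution expects input with rank 4, got 3"
```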
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index 997f910a2a9..b01fd5d5c95 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -310,6 +310,17 @@ class BiasAddTest(test.TestCase):
 
 class ConvolutionTest(test.TestCase):
 
+  def testInvalidShape(self):
+    with self.test_session():
+      images_2d = random_ops.random_uniform((5, 7, 9, 3), seed=1)
+      with self.assertRaisesRegexp(
+          ValueError, 'Convolution expects input with rank 5, got 4'):
+        layers_lib.convolution3d(images_2d, 32, 3)
+      images_3d = random_ops.random_uniform((5, 6, 7, 9, 3), seed=1)
+      with self.assertRaisesRegexp(
+          ValueError, 'Convolution expects input with rank 4, got 5'):
+        layers_lib.convolution2d(images_3d, 32, 3)
+
   def testInvalidDataFormat(self):
     height, width = 7, 9
     with self.test_session():
@@ -3155,7 +3166,7 @@ class RepeatTests(test.TestCase):
     with self.test_session():
       images = np.random.uniform(size=(5, height, width, 3)).astype(np.float32)
       output = _layers.repeat(images, 3, layers_lib.conv2d, 32, [3, 3])
-      self.assertEqual(output.op.name, 'Repeat/convolution_3/Relu')
+      self.assertEqual(output.op.name, 'Repeat/convolution2d_3/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 32])
 
   def testRepeatWithScope(self):
@@ -3749,7 +3760,7 @@ class StackTests(test.TestCase):
           layers_lib.convolution2d, [10, 20, 30],
           kernel_size=[3, 3],
           padding='SAME')
-      self.assertEqual(output.op.name, 'Stack/convolution_3/Relu')
+      self.assertEqual(output.op.name, 'Stack/convolution2d_3/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 30])
 
   def testStackWithScope(self):
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
index 392a490be15..8c118402a4c 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
@@ -60,8 +60,8 @@ class RevBlockTest(test.TestCase):
       sess.run(variables.global_variables_initializer())
       x1, x2, x1_inv, x2_inv = sess.run([x1, x2, x1_inv, x2_inv])
 
-      self.assertAllClose(x1, x1_inv)
-      self.assertAllClose(x2, x2_inv)
+      self.assertAllClose(x1, x1_inv, atol=1e-5)
+      self.assertAllClose(x2, x2_inv, atol=1e-5)
 
   def testBackwardForward(self):
 
diff --git a/tensorflow/contrib/layers/python/layers/utils_test.py b/tensorflow/contrib/layers/python/layers/utils_test.py
index 3409860add8..645dc1291eb 100644
--- a/tensorflow/contrib/layers/python/layers/utils_test.py
+++ b/tensorflow/contrib/layers/python/layers/utils_test.py
@@ -294,7 +294,6 @@ class NPositiveIntegersTest(test.TestCase):
     self.assertEqual(utils.n_positive_integers(2, 2), (2, 2))
     self.assertEqual(utils.n_positive_integers(2, (2, 3)), (2, 3))
     self.assertEqual(utils.n_positive_integers(3, (2, 3, 1)), (2, 3, 1))
-    self.assertEqual(utils.n_positive_integers(3, (2, 3, 1)), (2, 3, 1))
     self.assertEqual(
         utils.n_positive_integers(3, tensor_shape.TensorShape([2, 3, 1])),
         (2, 3, 1))
diff --git a/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py b/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py
index b28835a8097..584556992a0 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py
@@ -36,7 +36,6 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import flags
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
index 8c85c431be6..14ee2ba6094 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
@@ -299,6 +299,7 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig):
     # so instead of breaking compatibility with that assumption, we
     # just manually initialize this field:
     self._train_distribute = None
+    self._device_fn = None
 
     gpu_options = config_pb2.GPUOptions(
         per_process_gpu_memory_fraction=gpu_memory_fraction)
diff --git a/tensorflow/contrib/lite/Makefile b/tensorflow/contrib/lite/Makefile
index b4504f246a0..65fba52d461 100644
--- a/tensorflow/contrib/lite/Makefile
+++ b/tensorflow/contrib/lite/Makefile
@@ -90,7 +90,8 @@ $(wildcard tensorflow/contrib/lite/kernels/*.c) \
 $(wildcard tensorflow/contrib/lite/kernels/internal/*.c) \
 $(wildcard tensorflow/contrib/lite/kernels/internal/optimized/*.c) \
 $(wildcard tensorflow/contrib/lite/kernels/internal/reference/*.c) \
-$(wildcard tensorflow/contrib/lite/downloads/farmhash/src/farmhash.cc)
+$(wildcard tensorflow/contrib/lite/downloads/farmhash/src/farmhash.cc) \
+$(wildcard tensorflow/contrib/lite/downloads/fft2d/fftsg.c)
 # Remove any duplicates.
 CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS))
 CORE_CC_EXCLUDE_SRCS := \
diff --git a/tensorflow/contrib/lite/download_dependencies.sh b/tensorflow/contrib/lite/download_dependencies.sh
index a93ed201d64..436c3e1d4ca 100755
--- a/tensorflow/contrib/lite/download_dependencies.sh
+++ b/tensorflow/contrib/lite/download_dependencies.sh
@@ -30,12 +30,15 @@ if [ ! -f $BZL_FILE_PATH ]; then
 fi
 
 EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
-GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
+# TODO(yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once
+# the archive has been propagated in mirror.bazel.build.
+GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
 ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)"
 NEON_2_SSE_URL="https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip"
 FARMHASH_URL="https://mirror.bazel.build/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz"
 FLATBUFFERS_URL="https://github.com/google/flatbuffers/archive/master.zip"
+FFT2D_URL="https://mirror.bazel.build/www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz"
 
 # TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64,
 #                   so work around it by patching the source.
@@ -91,6 +94,7 @@ download_and_extract "${ABSL_URL}" "${DOWNLOADS_DIR}/absl"
 download_and_extract "${NEON_2_SSE_URL}" "${DOWNLOADS_DIR}/neon_2_sse"
 download_and_extract "${FARMHASH_URL}" "${DOWNLOADS_DIR}/farmhash"
 download_and_extract "${FLATBUFFERS_URL}" "${DOWNLOADS_DIR}/flatbuffers"
+download_and_extract "${FFT2D_URL}" "${DOWNLOADS_DIR}/fft2d"
 
 replace_by_sed 's#static uint32x4_t p4ui_CONJ_XOR = vld1q_u32( conj_XOR_DATA );#static uint32x4_t p4ui_CONJ_XOR; // = vld1q_u32( conj_XOR_DATA ); - Removed by script#' \
   "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
diff --git a/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj b/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
index b0236e9c608..98d3b5bb8ad 100644
--- a/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
+++ b/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
@@ -326,10 +326,6 @@
 				GCC_WARN_UNUSED_VARIABLE = YES;
 				HEADER_SEARCH_PATHS = (
 					"$(inherited)",
-					../../../../../../,
-					../../../downloads/flatbuffers/include/,
-					../../../downloads/eigen/,
-					../../../downloads/,
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
 				MTL_ENABLE_DEBUG_INFO = YES;
@@ -373,10 +369,6 @@
 				GCC_WARN_UNUSED_VARIABLE = YES;
 				HEADER_SEARCH_PATHS = (
 					"$(inherited)",
-					../../../../../../,
-					../../../downloads/flatbuffers/include/,
-					../../../downloads/eigen/,
-					../../../downloads/,
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
 				MTL_ENABLE_DEBUG_INFO = NO;
diff --git a/tensorflow/contrib/lite/g3doc/apis.md b/tensorflow/contrib/lite/g3doc/apis.md
index fe208e47d1a..50cc146a87e 100644
--- a/tensorflow/contrib/lite/g3doc/apis.md
+++ b/tensorflow/contrib/lite/g3doc/apis.md
@@ -29,7 +29,7 @@ interpreter->AllocateTensors();
 float* input = interpreter->typed_input_tensor<float>(0);
 // Fill `input`.
 interpreter->Invoke();
-float* output = interpreter->type_output_tensor<float>(0);
+float* output = interpreter->typed_output_tensor<float>(0);
 ```
 ### Data Alignment
 
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
index 300786c3ca0..18f64651889 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
@@ -54,6 +54,9 @@ import android.view.Surface;
 import android.view.TextureView;
 import android.view.View;
 import android.view.ViewGroup;
+import android.widget.CompoundButton;
+import android.widget.NumberPicker;
+import android.widget.ToggleButton;
 import android.widget.TextView;
 import android.widget.Toast;
 import java.io.IOException;
@@ -82,6 +85,8 @@ public class Camera2BasicFragment extends Fragment
   private boolean runClassifier = false;
   private boolean checkedPermissions = false;
   private TextView textView;
+  private ToggleButton toggle;
+  private NumberPicker np;
   private ImageClassifier classifier;
 
   /** Max preview width that is guaranteed by Camera2 API */
@@ -289,6 +294,24 @@ public class Camera2BasicFragment extends Fragment
   public void onViewCreated(final View view, Bundle savedInstanceState) {
     textureView = (AutoFitTextureView) view.findViewById(R.id.texture);
     textView = (TextView) view.findViewById(R.id.text);
+    toggle = (ToggleButton) view.findViewById(R.id.button);
+
+    toggle.setOnCheckedChangeListener(new CompoundButton.OnCheckedChangeListener() {
+      public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) {
+        classifier.setUseNNAPI(isChecked);
+      }
+    });
+
+    np = (NumberPicker) view.findViewById(R.id.np);
+    np.setMinValue(1);
+    np.setMaxValue(10);
+    np.setWrapSelectorWheel(true);
+    np.setOnValueChangedListener(new NumberPicker.OnValueChangeListener() {
+      @Override
+      public void onValueChange(NumberPicker picker, int oldVal, int newVal){
+        classifier.setNumThreads(newVal);
+      }
+    });
   }
 
   /** Load the model and labels. */
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
index c57bb348c5b..d32c0779101 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
@@ -142,6 +142,16 @@ public abstract class ImageClassifier {
     }
   }
 
+  public void setUseNNAPI(Boolean nnapi) {
+    if (tflite != null)
+        tflite.setUseNNAPI(nnapi);
+  }
+
+  public void setNumThreads(int num_threads) {
+    if (tflite != null)
+        tflite.setNumThreads(num_threads);
+  }
+
   /** Closes tflite to release resources. */
   public void close() {
     tflite.close();
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
index 15305c436e0..db557ad62f6 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
@@ -22,24 +22,59 @@
         android:layout_width="wrap_content"
         android:layout_height="wrap_content"
         android:layout_alignParentStart="true"
+        android:layout_alignParentLeft="true"
         android:layout_alignParentTop="true" />
 
    [XML element markup stripped during extraction: this hunk appears to rework the RelativeLayout, adding the ToggleButton (id "button") and NumberPicker (id "np") that Camera2BasicFragment wires up above.]
 
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml
index a08ec3eb629..29a033bcd43 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml
@@ -21,4 +21,6 @@
    [String-resource markup stripped during extraction: the existing entries "NN:On", "NN:Off", and "Use NNAPI" appear as context; the hunk adds two new string resources with values "tflite" and "NNAPI".]
 
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
index e915e65aa13..e84ee711298 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
@@ -215,6 +215,13 @@ public final class Interpreter implements AutoCloseable {
     }
   }
 
+  public void setNumThreads(int num_threads) {
+    if (wrapper == null) {
+      throw new IllegalStateException("The interpreter has already been closed.");
+    }
+    wrapper.setNumThreads(num_threads);
+  }
+
   /** Release resources associated with the {@code Interpreter}. */
   @Override
   public void close() {
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
index dfc8ac111a2..2fc803715be 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
@@ -153,6 +153,10 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     useNNAPI(interpreterHandle, useNNAPI);
   }
 
+  void setNumThreads(int num_threads) {
+    numThreads(interpreterHandle, num_threads);
+  }
+
   /** Gets index of an input given its name. */
   int getInputIndex(String name) {
     if (inputsIndexes == null) {
@@ -324,6 +328,8 @@ final class NativeInterpreterWrapper implements AutoCloseable {
 
   private static native void useNNAPI(long interpreterHandle, boolean state);
 
+  private static native void numThreads(long interpreterHandle, int num_threads);
+
   private static native long createErrorReporter(int size);
 
   private static native long createModel(String modelPathOrBuffer, long errorHandle);
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
index ccfdfd829b4..45f510da1d9 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
@@ -320,6 +320,16 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_useNNAPI(JNIEnv* env,
   interpreter->UseNNAPI(static_cast<bool>(state));
 }
 
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_numThreads(JNIEnv* env,
+                                                           jclass clazz,
+                                                           jlong handle,
+                                                           jint num_threads) {
+  tflite::Interpreter* interpreter = convertLongToInterpreter(env, handle);
+  if (interpreter == nullptr) return;
+  interpreter->SetNumThreads(static_cast<int>(num_threads));
+}
+
 JNIEXPORT jlong JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_createErrorReporter(
     JNIEnv* env, jclass clazz, jint size) {
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
index 0e28a77feea..eaa765cb343 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
@@ -61,7 +61,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputNames(JNIEnv* env,
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
  *  Method:
- *  Signature: (JZ)
+ *  Signature: (JZ)V
  */
 JNIEXPORT void JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_useNNAPI(JNIEnv* env,
@@ -69,6 +69,17 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_useNNAPI(JNIEnv* env,
                                                            jlong handle,
                                                            jboolean state);
 
+/*
+ *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
+ *  Method:
+ *  Signature: (JI)V
+ */
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_numThreads(JNIEnv* env,
+                                                           jclass clazz,
+                                                           jlong handle,
+                                                           jint num_threads);
+
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
  *  Method:
diff --git a/tensorflow/contrib/lite/kernels/add.cc b/tensorflow/contrib/lite/kernels/add.cc
index 63ea89df56b..e0aa070e2d0 100644
--- a/tensorflow/contrib/lite/kernels/add.cc
+++ b/tensorflow/contrib/lite/kernels/add.cc
@@ -176,7 +176,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                                   output);
   } else {
     context->ReportError(context,
-                         "Inputs and outputs not all float|unit8 types.");
+                         "Inputs and outputs not all float|uint8 types.");
     return kTfLiteError;
   }
 
diff --git a/tensorflow/contrib/lite/kernels/div.cc b/tensorflow/contrib/lite/kernels/div.cc
index 6dd243ad62e..ec380c8e495 100644
--- a/tensorflow/contrib/lite/kernels/div.cc
+++ b/tensorflow/contrib/lite/kernels/div.cc
@@ -106,6 +106,8 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
 #undef TF_LITE_DIV
 }
 
+
+
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteDivParams*>(node->builtin_data);
@@ -118,7 +120,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   if (output->type == kTfLiteFloat32) {
     EvalFloat(context, node, params, data, input1, input2, output);
   } else {
-    context->ReportError(context, "Inputs and outputs not all float types.");
+    context->ReportError(context,
+                         "Div only supports FLOAT32 and quantized UINT8 now.");
     return kTfLiteError;
   }
 
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index d585bcca0e5..9e9aba0169b 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -4374,7 +4374,7 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
   using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
   using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
 
-gemmlowp::ScopedProfilingLabel label("Softmax/8bit");
+  gemmlowp::ScopedProfilingLabel label("Softmax/8bit");
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index ae295cc8b58..4c8cbe42759 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -1403,6 +1403,33 @@ inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                output_data, output_dims);
 }
 
+inline void Div(const float* input1_data, const Dims<4>& input1_dims,
+                const float* input2_data, const Dims<4>& input2_dims,
+                float output_activation_min, float output_activation_max,
+                float* output_data, const Dims<4>& output_dims) {
+  const int batches =
+      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
+  const int height =
+      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
+  const int width =
+      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
+  const int depth =
+      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[Offset(input1_dims, c, x, y, b)] /
+                      input2_data[Offset(input2_dims, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
 // TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
 // dimensionality if the runtime code does a single loop over one dimension
 // that handles broadcasting as the base case. The code generator would then
@@ -1444,18 +1471,6 @@ void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
   }
 }
 
-inline void Div(const float* input1_data, const Dims<4>& input1_dims,
-                const float* input2_data, const Dims<4>& input2_dims,
-                float output_activation_min, float output_activation_max,
-                float* output_data, const Dims<4>& output_dims) {
-  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
-  for (int i = 0; i < flat_size; ++i) {
-    output_data[i] = ActivationFunctionWithMinMax(
-        input1_data[i] / input2_data[i], output_activation_min,
-        output_activation_max);
-  }
-}
-
 inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
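
The relocated `Div` above is a plain elementwise divide over matching
`Dims<4>` shapes, followed by the usual activation clamp. A minimal NumPy
sketch of the same semantics (hypothetical helper name, not TF Lite code):

```
import numpy as np

def div_with_activation(x1, x2, act_min, act_max):
  """Elementwise x1 / x2, clamped to [act_min, act_max], mirroring
  ActivationFunctionWithMinMax; shapes must match (no broadcasting)."""
  assert x1.shape == x2.shape
  return np.clip(x1 / x2, act_min, act_max)

# Example on a 4-D tensor, as in the nested batch/height/width/depth loops.
a = np.full((1, 2, 2, 3), 6.0, dtype=np.float32)
b = np.full((1, 2, 2, 3), 4.0, dtype=np.float32)
print(div_with_activation(a, b, 0.0, 1.0))  # 6/4 = 1.5, clamped to 1.0
```
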
diff --git a/tensorflow/contrib/lite/kernels/sub.cc b/tensorflow/contrib/lite/kernels/sub.cc
index 66b06aeaec5..7c60a4fdbff 100644
--- a/tensorflow/contrib/lite/kernels/sub.cc
+++ b/tensorflow/contrib/lite/kernels/sub.cc
@@ -174,7 +174,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     EvalQuantized<kernel_type>(context, node, params, data, input1, input2,
                                output);
   } else {
-    context->ReportError(context, "Inputs and outputs not all float types.");
+    context->ReportError(context,
+                         "Inputs and outputs not all float|uint8 types.");
     return kTfLiteError;
   }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
index 477e7f13da3..38e0005890a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
@@ -32,7 +32,7 @@ bool ResolveTensorFlowMerge::Run(Model* model, std::size_t op_index) {
   }
 
   // We need to yield until this Merge node has only 1 input, which will mean
-  // that that is the selected input. Other graph transformations on other nodes
+  // that is the selected input. Other graph transformations on other nodes
   // such as ResolveTensorFlowSwitch, will take care of trimming the
   // non-selected inputs, so that at some point there will be only 1 input left.
   if (merge_op->inputs.size() > 1) {
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 705a9d69a62..482cc71d8b3 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -152,9 +152,9 @@ enum class AxesOrder {
 };
 
 // The type of the scalars in an array.
-// Note that that does not by itself tell whether the values in the array are
-// real (are literally interpreted as real numbers) or quantized (only acquire
-// a meaning as real numbers in conjunction with QuantizationParams).
+// Note that the type does not by itself tell whether the values in the array
+// are real (are literally interpreted as real numbers) or quantized (only
+// acquire a meaning as real numbers in conjunction with QuantizationParams).
 //
 // In practice though:
 //   float values are always real
diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py
index 8c3a8afe7a0..bdad34a665e 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import nn
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util.deprecation import deprecated_argument_lookup
 
 __all__ = [
     "absolute_difference", "add_loss", "cosine_distance",
@@ -651,11 +652,9 @@ def cosine_distance(predictions,
     ValueError: If `predictions` shape doesn't match `labels` shape, or
       `weights` is `None`.
   """
-  if dim is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'axis' and 'dim'")
-    axis = dim
-  if axis is None and dim is None:
+  axis = deprecated_argument_lookup(
+      "axis", axis, "dim", dim)
+  if axis is None:
     raise ValueError("You must specify 'axis'.")
   with ops.name_scope(scope, "cosine_distance_loss",
                       [predictions, labels, weights]) as scope:
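
The `deprecated_argument_lookup` helper folds the old `dim`/new `axis`
handling into one call: it raises if both are supplied and otherwise returns
whichever was set. A behavioral sketch, not the actual implementation in
`tensorflow.python.util.deprecation`:

```
def deprecated_argument_lookup(new_name, new_value, old_name, old_value):
  """Prefer new_value; fall back to the deprecated old_value."""
  if old_value is not None:
    if new_value is not None:
      raise ValueError("Cannot specify both '%s' and '%s'"
                       % (new_name, old_name))
    return old_value
  return new_value

print(deprecated_argument_lookup("axis", None, "dim", 2))  # 2
print(deprecated_argument_lookup("axis", 1, "dim", None))  # 1
```
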
diff --git a/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py b/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
index 2b9eee4ef7b..de76acb51ff 100644
--- a/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
+++ b/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
@@ -711,7 +711,7 @@ def _find_loss_augmented_facility_idx(pairwise_distances, labels, chosen_ids,
       candidate_scores, margin_multiplier * nmi_scores)
 
   argmax_index = math_ops.to_int32(
-      math_ops.argmax(candidate_scores, dimension=0))
+      math_ops.argmax(candidate_scores, axis=0))
 
   return candidate_ids[argmax_index]
 
@@ -811,7 +811,7 @@ def update_medoid_per_cluster(pairwise_distances, pairwise_distances_subset,
   candidate_scores = math_ops.add(scores_fac, margin_multiplier * scores_margin)
 
   argmax_index = math_ops.to_int32(
-      math_ops.argmax(candidate_scores, dimension=0))
+      math_ops.argmax(candidate_scores, axis=0))
 
   best_medoid = math_ops.to_int32(cluster_member_ids[argmax_index])
   chosen_ids = update_1d_tensor(chosen_ids, cluster_idx, best_medoid)
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index 48953e2e384..eff9081e35c 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -27,7 +27,9 @@ if [ ! -f $BZL_FILE_PATH ]; then
 fi
 
 EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
-GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
+# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once
+# the archive has been propagated in mirror.bazel.build.
+GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
 NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
 PROTOBUF_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
diff --git a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
index 4090c1ff3e5..f37a2593e26 100644
--- a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
+++ b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
@@ -348,7 +348,7 @@ def _freeze_graph_with_def_protos(input_graph_def, output_node_names,
                                   input_saver_def, input_checkpoint):
   """Converts all variables in a graph and checkpoint into constants.
 
-  During this process, we need to retain certain initialzer nodes (e.g. table
+  During this process, we need to retain certain initializer nodes (e.g. table
   initializer nodes). Instead of determining which dependencies
   of the shared initializer node (e.g. group_deps) to keep, we
   reconstruct the connections between the individual initializer nodes and
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 5364e3075da..00a933e5e0c 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -2834,7 +2834,9 @@ def streaming_sparse_average_precision_at_top_k(top_k_predictions,
       name=name)
 
 
-@deprecated(None, 'Please switch to tf.metrics.mean.')
+@deprecated(None,
+            'Please switch to tf.metrics.mean_absolute_error. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_mean_absolute_error(predictions,
                                   labels,
                                   weights=None,
@@ -2953,7 +2955,9 @@ def streaming_mean_relative_error(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None,
+            'Please switch to tf.metrics.mean_squared_error. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_mean_squared_error(predictions,
                                  labels,
                                  weights=None,
@@ -3011,7 +3015,10 @@ def streaming_mean_squared_error(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(
+    None,
+    'Please switch to tf.metrics.root_mean_squared_error. Note that the '
+    'order of the labels and predictions arguments has been switched.')
 def streaming_root_mean_squared_error(predictions,
                                       labels,
                                       weights=None,
@@ -3351,7 +3358,7 @@ def streaming_mean_cosine_distance(predictions,
   radial_diffs = math_ops.reduce_sum(
       radial_diffs, reduction_indices=[
           dim,
-      ], keep_dims=True)
+      ], keepdims=True)
   mean_distance, update_op = streaming_mean(radial_diffs, weights, None, None,
                                             name or 'mean_cosine_distance')
   mean_distance = math_ops.subtract(1.0, mean_distance)
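
The expanded deprecation messages flag a real migration hazard: the
`tf.metrics` replacements take `labels` first, whereas the `streaming_*`
versions took `predictions` first. A minimal sketch of the switch:

```
import tensorflow as tf

labels = tf.constant([1.0, 2.0, 3.0])
predictions = tf.constant([1.5, 1.5, 3.5])

# Old (tf.contrib.metrics): streaming_mean_squared_error(predictions, labels)
# New (tf.metrics): labels come first.
value_op, update_op = tf.metrics.mean_squared_error(labels, predictions)
```
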
diff --git a/tensorflow/contrib/nn/python/ops/sampling_ops.py b/tensorflow/contrib/nn/python/ops/sampling_ops.py
index 63fc487dca6..e65925610c5 100644
--- a/tensorflow/contrib/nn/python/ops/sampling_ops.py
+++ b/tensorflow/contrib/nn/python/ops/sampling_ops.py
@@ -88,7 +88,7 @@ def _rank_resample(weights, biases, inputs, sampled_values, num_resampled,
     return math_ops.reduce_logsumexp(
         math_ops.matmul(embeddings, reweighted_inputs, transpose_b=True),
         axis=1,
-        keep_dims=False)
+        keepdims=False)
 
   # Calling this protected form of embedding_lookup allows co-locating
   # the logsumexp computation with the partitioned weights, which yields
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index c57c5e3f29f..612ecc3e638 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -14,6 +14,7 @@ py_library(
     name = "opt_py",
     srcs = [
         "__init__.py",
+        "python/training/adamax.py",
         "python/training/addsign.py",
         "python/training/drop_stale_gradient_optimizer.py",
         "python/training/elastic_average_optimizer.py",
@@ -43,11 +44,27 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
 )
 
+py_test(
+    name = "adamax_test",
+    srcs = ["python/training/adamax_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":opt_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:training",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "external_optimizer_test",
     srcs = ["python/training/external_optimizer_test.py"],
diff --git a/tensorflow/contrib/opt/__init__.py b/tensorflow/contrib/opt/__init__.py
index 6c1bb1adc09..4c13c8e2471 100644
--- a/tensorflow/contrib/opt/__init__.py
+++ b/tensorflow/contrib/opt/__init__.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=wildcard-import
+from tensorflow.contrib.opt.python.training.adamax import *
 from tensorflow.contrib.opt.python.training.addsign import *
 from tensorflow.contrib.opt.python.training.drop_stale_gradient_optimizer import *
 from tensorflow.contrib.opt.python.training.external_optimizer import *
@@ -36,6 +37,7 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 
 _allowed_symbols = [
+    'AdaMaxOptimizer',
     'PowerSignOptimizer',
     'AddSignOptimizer',
     'DelayCompensatedGradientDescentOptimizer',
diff --git a/tensorflow/contrib/opt/python/training/adamax.py b/tensorflow/contrib/opt/python/training/adamax.py
new file mode 100644
index 00000000000..686bac0d840
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/adamax.py
@@ -0,0 +1,191 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""AdaMax for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import adam
+from tensorflow.python.training import training_ops
+
+
+class AdaMaxOptimizer(adam.AdamOptimizer):
+  """Optimizer that implements the AdaMax algorithm.
+
+  AdaMax is sometimes superior to Adam, especially in models with embeddings;
+  see [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
+  ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
+  """
+
+  def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
+               use_locking=False, name="AdaMax"):
+    """Construct a new AdaMax optimizer.
+
+    Initialization:
+
+    ```
+    m_0 <- 0 (Initialize initial 1st moment vector)
+    v_0 <- 0 (Initialize the exponentially weighted infinity norm)
+    t <- 0 (Initialize timestep)
+    ```
+
+    The update rule for `variable` with gradient `g` uses an optimization
+    described at the end of section 7.1 of the paper:
+
+    ```
+    t <- t + 1
+
+    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+    v_t <- max(beta2 * v_{t-1}, abs(g))
+    variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+    ```
+
+    As in AdamOptimizer, the epsilon is added for numerical stability
+    (especially to get rid of division by zero when v_t = 0).
+
+    In contrast to AdamOptimizer, the sparse implementation of this algorithm
+    (used when the gradient is an IndexedSlices object, typically because of
+    `tf.gather` or an embedding lookup in the forward pass) only updates
+    variable slices and the corresponding `m_t`, `v_t` terms when that part
+    of the variable was used in the forward pass. This means the sparse
+    behavior differs from the dense behavior (similar to some momentum
+    implementations which ignore momentum unless a variable slice was
+    actually used).
+
+    Args:
+      learning_rate: A Tensor or a floating point value.  The learning rate.
+      beta1: A float value or a constant float tensor.
+        The exponential decay rate for the 1st moment estimates.
+      beta2: A float value or a constant float tensor.
+        The exponential decay rate for the exponentially weighted infinity norm.
+      epsilon: A small constant for numerical stability.
+      use_locking: If True use locks for update operations.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "AdaMax".
+    """
+    super(AdaMaxOptimizer, self).__init__(learning_rate, beta1, beta2,
+                                          epsilon, use_locking, name)
+
+  def _get_beta_accumulators(self):
+    if context.executing_eagerly():
+      graph = None
+    else:
+      graph = ops.get_default_graph()
+    return self._get_non_slot_variable("beta1_power", graph=graph)
+
+  def _create_slots(self, var_list):
+    # Create the beta1 accumulators on the same device as the first
+    # variable. Sort the var_list to make sure this device is consistent across
+    # workers (these need to go on the same PS, otherwise some updates are
+    # silently ignored).
+    first_var = min(var_list, key=lambda x: x.name)
+    self._create_non_slot_variable(initial_value=self._beta1,
+                                   name="beta1_power",
+                                   colocate_with=first_var)
+
+    # Create slots for the first and second moments.
+    for v in var_list:
+      self._zeros_slot(v, "m", self._name)
+      self._zeros_slot(v, "v", self._name)
+
+  def _apply_dense(self, grad, var):
+    m = self.get_slot(var, "m")
+    v = self.get_slot(var, "v")
+    beta1_power = self._get_beta_accumulators()
+    return training_ops.apply_ada_max(
+        var, m, v,
+        math_ops.cast(beta1_power, var.dtype.base_dtype),
+        math_ops.cast(self._lr_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta1_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta2_t, var.dtype.base_dtype),
+        math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
+        grad, use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, var):
+    m = self.get_slot(var, "m")
+    v = self.get_slot(var, "v")
+    beta1_power = self._get_beta_accumulators()
+    return training_ops.resource_apply_ada_max(
+        var.handle, m.handle, v.handle,
+        math_ops.cast(beta1_power, grad.dtype.base_dtype),
+        math_ops.cast(self._lr_t, grad.dtype.base_dtype),
+        math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
+        math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
+        math_ops.cast(self._epsilon_t, grad.dtype.base_dtype),
+        grad, use_locking=self._use_locking)
+
+  def _apply_sparse_shared(self, grad, var, indices,
+                           scatter_add, scatter_update):
+    beta1_power = self._get_beta_accumulators()
+    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
+    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
+    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
+    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
+    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
+    # m_t = beta1 * m + (1 - beta1) * g_t
+    m = self.get_slot(var, "m")
+    m_slice = array_ops.gather(m, indices)
+    m_t_slice = m_slice * beta1_t + grad * (1 - beta1_t)
+    with ops.control_dependencies([m_t_slice]):
+      m_t = scatter_update(m, indices, m_t_slice)
+    # u_t = max(beta2 * u, abs(g_t))
+    v = self.get_slot(var, "v")
+    v_slice = array_ops.gather(v, indices)
+    v_t_slice = math_ops.maximum(v_slice * beta2_t, math_ops.abs(grad))
+    with ops.control_dependencies([v_t_slice]):
+      v_t = scatter_update(v, indices, v_t_slice)
+    # theta_t = theta - lr / (1 - beta1^t) * m_t / u_t
+    var_slice = -lr_t / (1 - beta1_power) * (m_t_slice /
+                                             (v_t_slice + epsilon_t))
+    with ops.control_dependencies([var_slice]):
+      var_update = scatter_add(var, indices, var_slice)
+    return control_flow_ops.group(*[var_update, m_t, v_t])
+
+  def _apply_sparse(self, grad, var):
+    return self._apply_sparse_shared(
+        grad.values, var, grad.indices,
+        lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
+            x, i, v, use_locking=self._use_locking),
+        lambda x, i, v: state_ops.scatter_update(  # pylint: disable=g-long-lambda
+            x, i, v, use_locking=self._use_locking))
+
+  def _resource_scatter_update(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_update(
+            x.handle, i, v)]):
+      return x.value()
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    return self._apply_sparse_shared(
+        grad, var, indices,
+        self._resource_scatter_add, self._resource_scatter_update)
+
+  def _finish(self, update_ops, name_scope):
+    # Update the power accumulators.
+    with ops.control_dependencies(update_ops):
+      beta1_power = self._get_beta_accumulators()
+      with ops.colocate_with(beta1_power):
+        update_beta1 = beta1_power.assign(
+            beta1_power * self._beta1_t, use_locking=self._use_locking)
+    return control_flow_ops.group(*update_ops + [update_beta1],
+                                  name=name_scope)
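
With the export added to `tensorflow/contrib/opt/__init__.py` above, the
optimizer is reachable as `tf.contrib.opt.AdaMaxOptimizer`. A minimal
graph-mode usage sketch (toy loss, default hyperparameters):

```
import tensorflow as tf

x = tf.Variable([1.0, 2.0])
loss = tf.reduce_sum(tf.square(x))

opt = tf.contrib.opt.AdaMaxOptimizer(learning_rate=0.001, beta1=0.9,
                                     beta2=0.999, epsilon=1e-8)
train_op = opt.minimize(loss)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for _ in range(3):
    sess.run(train_op)
  print(sess.run(x))  # entries shrink slightly toward zero
```
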
diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py
new file mode 100644
index 00000000000..bc92a7006f1
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/adamax_test.py
@@ -0,0 +1,348 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for AdaMax."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.opt.python.training import adamax
+from tensorflow.python.client import session
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def adamax_update_numpy(param,
+                        g_t,
+                        t,
+                        m,
+                        v,
+                        alpha=0.001,
+                        beta1=0.9,
+                        beta2=0.999,
+                        epsilon=1e-8):
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = np.maximum(beta2 * v, np.abs(g_t))
+  param_t = param - (alpha / (1 - beta1**t)) * (m_t / (v_t + epsilon))
+  return param_t, m_t, v_t
+
+
+def adamax_sparse_update_numpy(param,
+                               indices,
+                               g_t,
+                               t,
+                               m,
+                               v,
+                               alpha=0.001,
+                               beta1=0.9,
+                               beta2=0.999,
+                               epsilon=1e-8):
+  m_t, v_t, param_t = np.copy(m), np.copy(v), np.copy(param)
+  m_t_slice = beta1 * m[indices] + (1 - beta1) * g_t
+  v_t_slice = np.maximum(beta2 * v[indices], np.abs(g_t))
+  param_t_slice = param[indices] - ((alpha / (1 - beta1**t)) *
+                                    (m_t_slice / (v_t_slice + epsilon)))
+  m_t[indices] = m_t_slice
+  v_t[indices] = v_t_slice
+  param_t[indices] = param_t_slice
+  return param_t, m_t, v_t
+
+
+class AdaMaxOptimizerTest(test.TestCase):
+
+  def doTestSparse(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        zero_slots = lambda: np.zeros((3), dtype=dtype.as_numpy_dtype)
+        m0, v0, m1, v1 = zero_slots(), zero_slots(), zero_slots(), zero_slots()
+        var0_np = np.array([1.0, 2.0, 3.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([4.0, 5.0, 6.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([3]))
+        grads1_np_indices = np.array([2, 1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([3]))
+        opt = adamax.AdaMaxOptimizer()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0, 3.0], var0.eval())
+        self.assertAllClose([4.0, 5.0, 6.0], var1.eval())
+
+        beta1_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of AdaMax
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adamax_sparse_update_numpy(
+              var0_np, grads0_np_indices, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_sparse_update_numpy(
+              var1_np, grads1_np_indices, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSparse(self):
+    self.doTestSparse(use_resource=False)
+
+  def testResourceSparse(self):
+    self.doTestSparse(use_resource=True)
+
+  def testSparseDevicePlacement(self):
+    for index_dtype in [dtypes.int32, dtypes.int64]:
+      with self.test_session(force_gpu=test.is_gpu_available()):
+        # If a GPU is available, tests that all optimizer ops can be placed on
+        # it (i.e. they have GPU kernels).
+        var = variables.Variable([[1.0], [2.0]])
+        indices = constant_op.constant([0, 1], dtype=index_dtype)
+        gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices))
+        optimizer = adamax.AdaMaxOptimizer(3.0)
+        minimize_op = optimizer.minimize(gathered_sum)
+        variables.global_variables_initializer().run()
+        minimize_op.run()
+
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        repeated_index_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        aggregated_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adamax.AdaMaxOptimizer().apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)])
+        aggregated_update = adamax.AdaMaxOptimizer().apply_gradients(
+            [(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            repeated_index_update_var.eval())
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              repeated_index_update_var.eval())
+
+  def doTestBasic(self, use_resource=False):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.test_session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(
+              var0_np, name="var0_%d" % i)
+          var1 = resource_variable_ops.ResourceVariable(
+              var1_np, name="var1_%d" % i)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        opt = adamax.AdaMaxOptimizer()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        opt_variables = opt.variables()
+        beta1_power = opt._get_beta_accumulators()
+        self.assertTrue(beta1_power is not None)
+        self.assertIn(beta1_power, opt_variables)
+
+        with ops.Graph().as_default():
+          # Shouldn't return non-slot variables from other graphs.
+          self.assertEqual(0, len(opt.variables()))
+
+        if not context.executing_eagerly():
+          self.evaluate(variables.global_variables_initializer())
+          # Fetch params to validate initial values
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        beta1_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of AdaMax
+        for t in range(1, 4):
+          if not context.executing_eagerly():
+            self.evaluate(update)
+          elif t > 1:
+            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta1_power))
+
+          var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+          if use_resource:
+            self.assertEqual("var0_%d/AdaMax:0" % (i,),
+                             opt.get_slot(var=var0, name="m").name)
+
+  def testBasic(self):
+    with self.test_session():
+      self.doTestBasic(use_resource=False)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adamax.AdaMaxOptimizer(constant_op.constant(0.001))
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of AdaMax
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adamax.AdaMaxOptimizer()
+        update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        beta1_power = opt._get_beta_accumulators()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 3 steps of intertwined AdaMax1 and AdaMax2.
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          if t % 2 == 0:
+            update1.run()
+          else:
+            update2.run()
+
+          var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testTwoSessions(self):
+    optimizer = adamax.AdaMaxOptimizer()
+    g = ops.Graph()
+    with g.as_default():
+      with session.Session():
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+        optimizer.apply_gradients([(grads0, var0)])
+
+    gg = ops.Graph()
+    with gg.as_default():
+      with session.Session():
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+
+        # If the optimizer saves any state not keyed by graph the following line
+        # fails.
+        optimizer.apply_gradients([(grads0, var0)])
+
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = resource_variable_ops.ResourceVariable(1.)
+      v2 = resource_variable_ops.ResourceVariable(1.)
+      opt = adamax.AdaMaxOptimizer(1.)
+      opt.minimize(lambda: v1 + v2)
+      # There should be one non-slot variable (beta1_power) and two unique
+      # slot variables ("m" and "v") for each of v1 and v2.
+      self.assertEqual(5, len(set(opt.variables())))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
index 85e3e8d3791..ac04ad99110 100644
--- a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
@@ -85,7 +85,7 @@ class MovingAverageOptimizerTest(test.TestCase):
               state_ops.assign_add(ema_var1, [4.0, 4.0])
           ])
 
-          # Test taht saver with missing ema variables will fail.
+          # Test that saver with missing ema variables will fail.
           with self.assertRaisesRegexp(ValueError, r'Variable to swap'):
             opt.swapping_saver(var_list=[var0])
 
@@ -123,7 +123,7 @@ class MovingAverageOptimizerTest(test.TestCase):
             self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
             self.assertAllCloseAccordingToType([4.98, 5.98], var1.eval())
             self.assertAllCloseAccordingToType([6.99, 7.99], ema_var1.eval())
-            # Restore back to previou state.
+            # Restore back to previous state.
             train_saver.restore(sess, save_path)
 
           # If updates are parallel, this is not always true after the 1st step.
diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
index 6ade4ccd52c..8ac9b581455 100644
--- a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
+++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
@@ -456,7 +456,7 @@ class CheckpointingTests(test.TestCase):
         optimizer.apply_gradients(
             [(g, v) for g, v in zip(grad, model.vars)])
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testDeferredSlotRestoration(self):
     checkpoint_directory = self.get_temp_dir()
 
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
index dcb5bb6416a..46bfbb729fa 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -564,7 +564,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
   ### State
 
-  Internal methods apre passed a `state` argument with the correct
+  Internal methods are passed a `state` argument with the correct
   values to use for the slot and non-slot variables, and the hyper
   parameters.
   """
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index 4a8f8a04cc5..aa0ef643088 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -545,7 +545,7 @@ def _GetBatchNormParams(graph, context, has_scaling):
         gamma_tensor = graph.get_tensor_by_name(op.name + ':0')
 
   if not has_scaling:
-    gamma_tensor = array_ops.ones(batch_mean_tensor.shape)
+    gamma_tensor = array_ops.ones(moving_mean_tensor.shape)
 
   return _BatchNormMatch(
       layer_op=None,
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index 0232103c418..cd162bae25a 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -30,6 +30,7 @@ from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
 from tensorflow.contrib.seq2seq.python.ops import basic_decoder
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.layers import core as layers_core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -110,7 +111,12 @@ class AttentionWrapperTest(test.TestCase):
                          alignment_history=False,
                          expected_final_alignment_history=None,
                          attention_layer_size=6,
+                         attention_layer=None,
                          name=''):
+    attention_layer_sizes = (
+        [attention_layer_size] if attention_layer_size is not None else None)
+    attention_layers = (
+        [attention_layer] if attention_layer is not None else None)
     self._testWithMaybeMultiAttention(
         is_multi=False,
         create_attention_mechanisms=[create_attention_mechanism],
@@ -119,7 +125,8 @@ class AttentionWrapperTest(test.TestCase):
         attention_mechanism_depths=[attention_mechanism_depth],
         alignment_history=alignment_history,
         expected_final_alignment_history=expected_final_alignment_history,
-        attention_layer_sizes=[attention_layer_size],
+        attention_layer_sizes=attention_layer_sizes,
+        attention_layers=attention_layers,
         name=name)
 
   def _testWithMaybeMultiAttention(self,
@@ -131,6 +138,7 @@ class AttentionWrapperTest(test.TestCase):
                                    alignment_history=False,
                                    expected_final_alignment_history=None,
                                    attention_layer_sizes=None,
+                                   attention_layers=None,
                                    name=''):
     # Allow is_multi to be True with a single mechanism to enable test for
     # passing in a single mechanism in a list.
@@ -144,12 +152,18 @@ class AttentionWrapperTest(test.TestCase):
     encoder_output_depth = 10
     cell_depth = 9
 
-    if attention_layer_sizes is None:
-      attention_depth = encoder_output_depth * len(create_attention_mechanisms)
-    else:
+    if attention_layer_sizes is not None:
       # Compute sum of attention_layer_sizes. Use encoder_output_depth if None.
       attention_depth = sum([attention_layer_size or encoder_output_depth
                              for attention_layer_size in attention_layer_sizes])
+    elif attention_layers is not None:
+      # Compute sum of attention_layers output depth.
+      attention_depth = sum(
+          attention_layer.compute_output_shape(
+              [batch_size, cell_depth + encoder_output_depth])[-1].value
+          for attention_layer in attention_layers)
+    else:
+      attention_depth = encoder_output_depth * len(create_attention_mechanisms)
 
     decoder_inputs = array_ops.placeholder_with_default(
         np.random.randn(batch_size, decoder_max_time,
@@ -171,13 +185,20 @@ class AttentionWrapperTest(test.TestCase):
       with vs.variable_scope(
           'root',
           initializer=init_ops.random_normal_initializer(stddev=0.01, seed=3)):
+        attention_layer_size = attention_layer_sizes
+        attention_layer = attention_layers
+        if not is_multi:
+          if attention_layer_size is not None:
+            attention_layer_size = attention_layer_size[0]
+          if attention_layer is not None:
+            attention_layer = attention_layer[0]
         cell = rnn_cell.LSTMCell(cell_depth)
         cell = wrapper.AttentionWrapper(
             cell,
             attention_mechanisms if is_multi else attention_mechanisms[0],
-            attention_layer_size=(attention_layer_sizes if is_multi
-                                  else attention_layer_sizes[0]),
-            alignment_history=alignment_history)
+            attention_layer_size=attention_layer_size,
+            alignment_history=alignment_history,
+            attention_layer=attention_layer)
         helper = helper_py.TrainingHelper(decoder_inputs,
                                           decoder_sequence_length)
         my_decoder = basic_decoder.BasicDecoder(
@@ -260,6 +281,41 @@ class AttentionWrapperTest(test.TestCase):
             expected_final_alignment_history,
             final_alignment_history_info)
 
+  def testBahdanauNormalizedDType(self):
+    for dtype in [np.float16, np.float32, np.float64]:
+      num_units = 128
+      encoder_outputs = array_ops.placeholder(dtype, shape=[64, None, 256])
+      encoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64])
+      decoder_inputs = array_ops.placeholder(dtype, shape=[64, None, 128])
+      decoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64])
+      batch_size = 64
+      attention_mechanism = wrapper.BahdanauAttention(
+          num_units=num_units,
+          memory=encoder_outputs,
+          memory_sequence_length=encoder_sequence_length,
+          normalize=True,
+          dtype=dtype,
+      )
+      cell = rnn_cell.LSTMCell(num_units)
+      cell = wrapper.AttentionWrapper(cell, attention_mechanism)
+
+      helper = helper_py.TrainingHelper(decoder_inputs,
+                                        decoder_sequence_length)
+      my_decoder = basic_decoder.BasicDecoder(
+          cell=cell,
+          helper=helper,
+          initial_state=cell.zero_state(
+              dtype=dtype, batch_size=batch_size))
+
+      final_outputs, final_state, _ = decoder.dynamic_decode(my_decoder)
+      self.assertTrue(
+          isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual(final_outputs.rnn_output.dtype, dtype)
+      self.assertTrue(
+          isinstance(final_state, wrapper.AttentionWrapperState))
+      self.assertTrue(
+          isinstance(final_state.cell_state, rnn_cell.LSTMStateTuple))
+
   def testBahdanauNotNormalized(self):
     create_attention_mechanism = wrapper.BahdanauAttention
 
@@ -797,6 +853,48 @@ class AttentionWrapperTest(test.TestCase):
         expected_final_alignment_history=expected_final_alignment_history,
         name='testMultiAttention')
 
+  def testMultiAttentionWithLayerInstances(self):
+    create_attention_mechanisms = (
+        wrapper.BahdanauAttention, wrapper.LuongAttention)
+
+    expected_final_output = BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 7), dtype=dtype('float32'), mean=0.0011709079),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=dtype('int32'), mean=3.2000000000000002))
+    expected_final_state = AttentionWrapperState(
+        cell_state=LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0038725811),
+            h=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0019329828)),
+        attention=ResultSummary(
+            shape=(5, 7), dtype=dtype('float32'), mean=0.001174294),
+        time=3,
+        alignments=(
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
+        attention_state=(
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
+        alignment_history=())
+
+    expected_final_alignment_history = (
+        ResultSummary(shape=(3, 5, 8), dtype=dtype('float32'), mean=0.125),
+        ResultSummary(shape=(3, 5, 8), dtype=dtype('float32'), mean=0.125))
+
+    self._testWithMaybeMultiAttention(
+        True,
+        create_attention_mechanisms,
+        expected_final_output,
+        expected_final_state,
+        attention_mechanism_depths=[9, 9],
+        attention_layers=[layers_core.Dense(3, use_bias=False),
+                          layers_core.Dense(4, use_bias=False)],
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        name='testMultiAttention')
+
   def testLuongMonotonicHard(self):
     # Run attention mechanism with mode='hard', make sure probabilities are hard
     b, t, u, d = 10, 20, 30, 40
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 8a40a7ab537..1c9d179e3c5 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -472,7 +472,8 @@ def _bahdanau_score(processed_query, keys, normalize):
     # Scalar used in weight normalization
     g = variable_scope.get_variable(
         "attention_g", dtype=dtype,
-        initializer=math.sqrt((1. / num_units)))
+        initializer=init_ops.constant_initializer(math.sqrt((1. / num_units))),
+        shape=())
     # Bias added prior to the nonlinearity
     b = variable_scope.get_variable(
         "attention_b", [num_units], dtype=dtype,
@@ -1082,7 +1083,8 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
                cell_input_fn=None,
                output_attention=True,
                initial_cell_state=None,
-               name=None):
+               name=None,
+               attention_layer=None):
     """Construct the `AttentionWrapper`.
 
     **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in
@@ -1125,7 +1127,8 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         (default), use the context as attention at each time step. Otherwise,
         feed the context and cell output into the attention layer to generate
         attention at each time step. If attention_mechanism is a list,
-        attention_layer_size must be a list of the same length.
+        attention_layer_size must be a list of the same length. If
+        attention_layer is set, this must be None.
       alignment_history: Python boolean, whether to store alignment history
         from all time steps in the final output state (currently stored as a
         time major `TensorArray` on which you must call `stack()`).
@@ -1145,12 +1148,19 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         does not match the batch size of `initial_cell_state`, proper
         behavior is not guaranteed.
       name: Name to use when creating ops.
+      attention_layer: A list of `tf.layers.Layer` instances or a
+        single `tf.layers.Layer` instance taking the context and cell output as
+        inputs to generate attention at each time step. If None (default), use
+        the context as attention at each time step. If attention_mechanism is a
+        list, attention_layer must be a list of the same length. If
+        attention_layer_size is set, this must be None.
 
     Raises:
       TypeError: `attention_layer_size` is not None and (`attention_mechanism`
         is a list but `attention_layer_size` is not; or vice versa).
       ValueError: if `attention_layer_size` is not None, `attention_mechanism`
-        is a list, and its length does not match that of `attention_layer_size`.
+        is a list, and its length does not match that of `attention_layer_size`;
+        if `attention_layer_size` and `attention_layer` are set simultaneously.
     """
     super(AttentionWrapper, self).__init__(name=name)
     rnn_cell_impl.assert_like_rnncell("cell", cell)
@@ -1181,6 +1191,10 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
             "cell_input_fn must be callable, saw type: %s"
             % type(cell_input_fn).__name__)
 
+    if attention_layer_size is not None and attention_layer is not None:
+      raise ValueError("Only one of attention_layer_size and attention_layer "
+                       "should be set")
+
     if attention_layer_size is not None:
       attention_layer_sizes = tuple(
           attention_layer_size
@@ -1199,6 +1213,22 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
               dtype=attention_mechanisms[i].dtype)
           for i, attention_layer_size in enumerate(attention_layer_sizes))
       self._attention_layer_size = sum(attention_layer_sizes)
+    elif attention_layer is not None:
+      self._attention_layers = tuple(
+          attention_layer
+          if isinstance(attention_layer, (list, tuple))
+          else (attention_layer,))
+      if len(self._attention_layers) != len(attention_mechanisms):
+        raise ValueError(
+            "If provided, attention_layer must contain exactly one "
+            "layer per attention_mechanism, saw: %d vs %d"
+            % (len(self._attention_layers), len(attention_mechanisms)))
+      self._attention_layer_size = sum(
+          layer.compute_output_shape(
+              [None,
+               cell.output_size + mechanism.values.shape[-1].value])[-1].value
+          for layer, mechanism in zip(
+              self._attention_layers, attention_mechanisms))
     else:
       self._attention_layers = None
       self._attention_layer_size = sum(
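
The new `attention_layer` argument accepts a caller-constructed layer (or one layer per mechanism) in place of the `Dense` layer that `attention_layer_size` builds internally. A minimal usage sketch, not part of the patch; the shapes, layer size, and the `tanh` activation are illustrative assumptions:

```python
import tensorflow as tf
from tensorflow.contrib.seq2seq import AttentionWrapper, LuongAttention

# Encoder output to attend over: [batch, time, depth] (illustrative shape).
encoder_outputs = tf.placeholder(tf.float32, [None, 20, 128])

cell = tf.nn.rnn_cell.BasicLSTMCell(128)
mechanism = LuongAttention(num_units=128, memory=encoder_outputs)

# attention_layer_size=64 would build a linear Dense(64) internally; with
# attention_layer, a custom layer (here one with a tanh nonlinearity)
# consumes the concatenated [cell output; attention context] instead.
# The two arguments are mutually exclusive.
attention_layer = tf.layers.Dense(64, use_bias=False, activation=tf.tanh)
attn_cell = AttentionWrapper(cell, mechanism, attention_layer=attention_layer)
```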
diff --git a/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
index 35c4b5bec17..345eb6cfaa6 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
+++ b/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.contrib.signal.python.kernel_tests import test_util
 from tensorflow.contrib.signal.python.ops import mel_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 # mel spectrum constants and functions.
@@ -173,6 +174,18 @@ class LinearToMelTest(test.TestCase):
         rewritten_graph = test_util.grappler_optimize(g, [mel_matrix])
         self.assertEqual(1, len(rewritten_graph.node))
 
+  def test_num_spectrogram_bins_dynamic(self):
+    with self.test_session(use_gpu=True):
+      num_spectrogram_bins = array_ops.placeholder(shape=(),
+                                                   dtype=dtypes.int32)
+      mel_matrix_np = spectrogram_to_mel_matrix(
+          20, 129, 8000.0, 125.0, 3800.0)
+      mel_matrix = mel_ops.linear_to_mel_weight_matrix(
+          20, num_spectrogram_bins, 8000.0, 125.0, 3800.0)
+      self.assertAllClose(
+          mel_matrix_np,
+          mel_matrix.eval(feed_dict={num_spectrogram_bins: 129}), atol=3e-6)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/signal/python/ops/mel_ops.py b/tensorflow/contrib/signal/python/ops/mel_ops.py
index d1a36548d95..1e84006116d 100644
--- a/tensorflow/contrib/signal/python/ops/mel_ops.py
+++ b/tensorflow/contrib/signal/python/ops/mel_ops.py
@@ -64,14 +64,11 @@ def _hertz_to_mel(frequencies_hertz, name=None):
         1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ))
 
 
-def _validate_arguments(num_mel_bins, num_spectrogram_bins, sample_rate,
+def _validate_arguments(num_mel_bins, sample_rate,
                         lower_edge_hertz, upper_edge_hertz, dtype):
   """Checks the inputs to linear_to_mel_weight_matrix."""
   if num_mel_bins <= 0:
     raise ValueError('num_mel_bins must be positive. Got: %s' % num_mel_bins)
-  if num_spectrogram_bins <= 0:
-    raise ValueError('num_spectrogram_bins must be positive. Got: %s' %
-                     num_spectrogram_bins)
   if sample_rate <= 0.0:
     raise ValueError('sample_rate must be positive. Got: %s' % sample_rate)
   if lower_edge_hertz < 0.0:
@@ -122,9 +119,9 @@ def linear_to_mel_weight_matrix(num_mel_bins=20,
 
   Args:
     num_mel_bins: Python int. How many bands in the resulting mel spectrum.
-    num_spectrogram_bins: Python int. How many bins there are in the source
-      spectrogram data, which is understood to be `fft_size // 2 + 1`, i.e. the
-      spectrogram only contains the nonredundant FFT bins.
+    num_spectrogram_bins: An integer `Tensor`. How many bins there are in the
+      source spectrogram data, which is understood to be `fft_size // 2 + 1`,
+      i.e. the spectrogram only contains the nonredundant FFT bins.
     sample_rate: Python float. Samples per second of the input signal used to
       create the spectrogram. We need this to figure out the actual frequencies
       for each spectrogram bin, which dictates how they are mapped into the mel
@@ -148,7 +145,10 @@ def linear_to_mel_weight_matrix(num_mel_bins=20,
   [mel]: https://en.wikipedia.org/wiki/Mel_scale
   """
   with ops.name_scope(name, 'linear_to_mel_weight_matrix') as name:
-    _validate_arguments(num_mel_bins, num_spectrogram_bins, sample_rate,
+    # Note: As num_spectrogram_bins is passed to `math_ops.linspace`
+    # and the validation is already done in linspace (both in shape function
+    # and in kernel), there is no need to validate num_spectrogram_bins here.
+    _validate_arguments(num_mel_bins, sample_rate,
                         lower_edge_hertz, upper_edge_hertz, dtype)
 
     # To preserve accuracy, we compute the matrix at float64 precision and then
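
Since `num_spectrogram_bins` may now be a `Tensor`, the weight matrix can be built from a runtime shape. A hedged sketch (sample rate and band edges are illustrative values mirroring the test above):

```python
import tensorflow as tf

# Magnitude spectrogram whose bin count is unknown statically:
# shape [batch, time, num_spectrogram_bins].
spectrogram = tf.placeholder(tf.float32, [None, None, None])
num_bins = tf.shape(spectrogram)[-1]  # dynamic int32 scalar Tensor

mel_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
    num_mel_bins=20,
    num_spectrogram_bins=num_bins,  # a Tensor is accepted after this change
    sample_rate=8000.0,
    lower_edge_hertz=125.0,
    upper_edge_hertz=3800.0)

# Warp the linear-frequency axis onto the mel scale.
mel_spectrogram = tf.tensordot(spectrogram, mel_matrix, 1)
```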
diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md
index 40f484fd783..746b9556423 100644
--- a/tensorflow/contrib/slim/README.md
+++ b/tensorflow/contrib/slim/README.md
@@ -290,9 +290,9 @@ slim.stack(x, slim.conv2d, [(32, [3, 3]), (32, [1, 1]), (64, [3, 3]), (64, [1, 1
 
 In addition to the types of scope mechanisms in TensorFlow
 ([name_scope](https://www.tensorflow.org/api_docs/python/tf/name_scope),
-[variable_scope](https://www.tensorflow.org/api_docs/python/tf/variable_scope),
+[variable_scope](https://www.tensorflow.org/api_docs/python/tf/variable_scope)),
 TF-Slim adds a new scoping mechanism called
-[arg_scope](https://www.tensorflow.org/api_docs/python/tf/contrib/framework/arg_scope),
+[arg_scope](https://www.tensorflow.org/api_docs/python/tf/contrib/framework/arg_scope).
 This new scope allows a user to specify one or more operations and a set of
 arguments which will be passed to each of the operations defined in the
 `arg_scope`. This functionality is best illustrated by example. Consider the
@@ -761,8 +761,8 @@ parts:
 3. Finalization: (optionally) perform any final operation to compute metric
 values. For example, computing means, mins, maxes, etc.
 
-For example, to compute `mean_absolute_error`, two variables, a `count` and
-`total` variable are *initialized* to zero. During *aggregation*, we observed
+For example, to compute `mean_absolute_error`, two variables (`count` and
+`total`) are *initialized* to zero. During *aggregation*, we observe
 some set of predictions and labels, compute their absolute differences and add
 the total to `total`. Each time we observe another value,
 `count` is incremented. Finally, during *finalization*, `total` is divided
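
A hedged sketch of the same three-phase pattern using the core `tf.metrics` equivalent (the batch values below are made up for illustration):

```python
import tensorflow as tf

labels = tf.placeholder(tf.float32, [None])
predictions = tf.placeholder(tf.float32, [None])

# value_op performs the finalization (total / count);
# update_op performs one aggregation step over a batch.
mae, update_op = tf.metrics.mean_absolute_error(labels, predictions)

with tf.Session() as sess:
  # Initialization: the metric's total and count variables start at zero.
  sess.run(tf.local_variables_initializer())
  for lbl, pred in [([1., 2.], [1.5, 2.]), ([3.], [5.])]:
    sess.run(update_op, {labels: lbl, predictions: pred})
  print(sess.run(mae))  # (0.5 + 0.0 + 2.0) / 3 = 0.8333...
```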
diff --git a/tensorflow/contrib/slim/python/slim/learning.py b/tensorflow/contrib/slim/python/slim/learning.py
index 6a200de1ea1..8a2c74742a8 100644
--- a/tensorflow/contrib/slim/python/slim/learning.py
+++ b/tensorflow/contrib/slim/python/slim/learning.py
@@ -389,7 +389,7 @@ def create_train_op(total_loss,
     total_loss: A `Tensor` representing the total loss.
     optimizer: A tf.Optimizer to use for computing the gradients.
     global_step: A `Tensor` representing the global step variable. If left as
-      `_USE_GLOBAL_STEP`, then slim.variables.global_step() is used.
+      `_USE_GLOBAL_STEP`, then tf.contrib.framework.global_step() is used.
     update_ops: An optional list of updates to execute. If `update_ops` is
       `None`, then the update ops are set to the contents of the
       `tf.GraphKeys.UPDATE_OPS` collection. If `update_ops` is not `None`, but
@@ -578,7 +578,8 @@ def train(train_op,
     is_chief: Specifies whether or not the training is being run by the primary
       replica during replica training.
     global_step: The `Tensor` representing the global step. If left as `None`,
-      then slim.variables.get_or_create_global_step() is used.
+      then training_util.get_or_create_global_step(), that is,
+      tf.contrib.framework.global_step() is used.
     number_of_steps: The max number of gradient steps to take during training,
       as measured by 'global_step': training will stop if global_step is
       greater than 'number_of_steps'. If the value is left as None, training
diff --git a/tensorflow/contrib/slim/python/slim/nets/resnet_v1.py b/tensorflow/contrib/slim/python/slim/nets/resnet_v1.py
index 235a595de49..11c4214176a 100644
--- a/tensorflow/contrib/slim/python/slim/nets/resnet_v1.py
+++ b/tensorflow/contrib/slim/python/slim/nets/resnet_v1.py
@@ -207,7 +207,7 @@ def resnet_v1(inputs,
         net = resnet_utils.stack_blocks_dense(net, blocks, output_stride)
         if global_pool:
           # Global average pooling.
-          net = math_ops.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
+          net = math_ops.reduce_mean(net, [1, 2], name='pool5', keepdims=True)
         if num_classes is not None:
           net = layers.conv2d(
               net,
diff --git a/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py b/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py
index 61665c9c8ba..19e0538dd1e 100644
--- a/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py
+++ b/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py
@@ -221,7 +221,7 @@ def resnet_v2(inputs,
             net, activation_fn=nn_ops.relu, scope='postnorm')
         if global_pool:
           # Global average pooling.
-          net = math_ops.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
+          net = math_ops.reduce_mean(net, [1, 2], name='pool5', keepdims=True)
         if num_classes is not None:
           net = layers_lib.conv2d(
               net,
diff --git a/tensorflow/contrib/tensor_forest/client/random_forest.py b/tensorflow/contrib/tensor_forest/client/random_forest.py
index 4abcc20ed33..35e8c92aba3 100644
--- a/tensorflow/contrib/tensor_forest/client/random_forest.py
+++ b/tensorflow/contrib/tensor_forest/client/random_forest.py
@@ -399,7 +399,7 @@ def get_combined_model_fn(model_fns):
   training ops: tf.group them.
   loss: average them.
   predictions: concat probabilities such that predictions[*][0-C1] are the
-    probablities for output 1 (where C1 is the number of classes in output 1),
+    probabilities for output 1 (where C1 is the number of classes in output 1),
     predictions[*][C1-(C1+C2)] are the probabilities for output 2 (where C2
     is the number of classes in output 2), etc.  Also stack predictions such
     that predictions[i][j] is the class prediction for example i and output j.
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc
index cf0db788a41..06bfe871fdf 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc
@@ -80,7 +80,7 @@ REGISTER_OP("HardRoutingFunction")
    regression model that translates from node features to
    probabilities.
 
-  path_probility: `path_probability[i]` gives the probability of reaching each
+  path_probability: `path_probability[i]` gives the probability of reaching each
    node in `path[i]`.
   path: `path[i][j]` gives the jth node in the path taken by the ith data
    instance.
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
index c9df09bfda4..1a055756c08 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
@@ -85,7 +85,7 @@ REGISTER_OP("StochasticHardRoutingFunction")
    regression model that translates from node features to
    probabilities.
 
-  path_probility: `path_probability[i]` gives the probability of reaching each
+  path_probability: `path_probability[i]` gives the probability of reaching each
    node in `path[i]`.
   path: `path[i][j]` gives the jth node in the path taken by the ith data
    instance.
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc
index b0d8b832b54..7d092bbc24d 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc
@@ -81,7 +81,7 @@ REGISTER_OP("StochasticHardRoutingGradient")
   tree_biases: `tree_biases[i]` gives the bias of the logistic
    regression model that translates from node features to
    probabilities.
-  path_probility: `path_probability[i]` gives the probability of reaching each
+  path_probability: `path_probability[i]` gives the probability of reaching each
    node in `path[i]`.
   path: `path[i][j]` gives the jth node in the path taken by the ith data
    instance.
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
index 44997ec5d6d..cefcc960510 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
@@ -421,7 +421,7 @@ double getChebyshevEpsilon(const std::vector<float>& mu1,
                           const std::vector<float>& mu2) {
   // Math time!!
   // We are trying to minimize d = |mu1 - x|^2 + |mu2 - y|^2 over the surface.
-  // Using Langrange multipliers, we get
+  // Using Lagrange multipliers, we get
   //   partial d / partial x = -2 mu1 + 2 x = lambda_1 1 + 2 lambda_3 x
   //   partial d / partial y = -2 mu2 + 2 y = lambda_2 1 - 2 lambda_3 y
   // or
@@ -485,7 +485,7 @@ double getChebyshevEpsilon(const std::vector& mu1,
   }
 
   double sdiscrim = sqrt(discrim);
-  // TODO(thomaswc): Analyze whetever one of these is always closer.
+  // TODO(thomaswc): Analyze whether one of these is always closer.
   double v1 = (-b + sdiscrim) / (2 * a);
   double v2 = (-b - sdiscrim) / (2 * a);
   double dist1 = getDistanceFromLambda3(v1, mu1, mu2);
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
index edbac670067..03aab1b61ee 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
@@ -123,7 +123,7 @@ bool BestSplitDominatesRegression(const Tensor& total_sums,
                                   const Tensor& split_squares,
                                   int32 accumulator);
 
-// Performs booststrap_samples bootstrap samples of the best split's class
+// Performs bootstrap_samples bootstrap samples of the best split's class
 // counts and the second best splits's class counts, and returns true if at
 // least dominate_fraction of the time, the former has a better (lower)
 // Gini impurity.  Does not take over ownership of *rand.
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
index 328af28725a..d3edb437337 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
@@ -60,7 +60,7 @@ class DecisionTreeResource : public ResourceBase {
   mutex* get_mutex() { return &mu_; }
 
   // Return the TreeNode for the leaf that the example ends up at according
-  // to decsion_tree_. Also fill in that leaf's depth if it isn't nullptr.
+  // to decision_tree_. Also fill in that leaf's depth if it isn't nullptr.
  int32 TraverseTree(const std::unique_ptr<TensorDataSet>& input_data,
                     int example, int32* depth, TreePath* path) const;
 
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
index bf2b2aaa3c8..3db351c328c 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
@@ -60,7 +60,7 @@ class InequalityDecisionNodeEvaluator : public BinaryDecisionNodeEvaluator {
   bool include_equals_;
 };
 
-// Evalutor for splits with multiple weighted features.
+// Evaluator for splits with multiple weighted features.
 class ObliqueInequalityDecisionNodeEvaluator
     : public BinaryDecisionNodeEvaluator {
  public:
diff --git a/tensorflow/contrib/tensor_forest/ops/model_ops.cc b/tensorflow/contrib/tensor_forest/ops/model_ops.cc
index 3099cccdf8b..98124d519c7 100644
--- a/tensorflow/contrib/tensor_forest/ops/model_ops.cc
+++ b/tensorflow/contrib/tensor_forest/ops/model_ops.cc
@@ -165,7 +165,7 @@ tree_handle: The handle to the tree.
 leaf_ids: `leaf_ids[i]` is the leaf id for input i.
 input_labels: The training batch's labels as a 1 or 2-d tensor.
   'input_labels[i][j]' gives the j-th label/target for the i-th input.
-input_weights: The training batch's eample weights as a 1-d tensor.
+input_weights: The training batch's weights as a 1-d tensor.
   'input_weights[i]' gives the weight for the i-th input.
 )doc");
 
diff --git a/tensorflow/contrib/tensor_forest/ops/stats_ops.cc b/tensorflow/contrib/tensor_forest/ops/stats_ops.cc
index e8b5c5d8a6e..5be581aaec4 100644
--- a/tensorflow/contrib/tensor_forest/ops/stats_ops.cc
+++ b/tensorflow/contrib/tensor_forest/ops/stats_ops.cc
@@ -75,7 +75,7 @@ REGISTER_OP("GrowTreeV4")
     .Attr("params: string")
     .Input("tree_handle: resource")
     .Input("stats_handle: resource")
-    .Input("finshed_nodes: int32")
+    .Input("finished_nodes: int32")
     .SetShapeFn(tensorflow::shape_inference::NoOutputs)
     .Doc(R"doc(
 Grows the tree for finished nodes and allocates waiting nodes.
@@ -83,7 +83,7 @@ Grows the tree for finished nodes and allocates waiting nodes.
 params: A serialized TensorForestParams proto.
 tree_handle: The handle to the tree.
 stats_handle: The handle to the stats.
-finshed_nodes: A 1-d Tensor of finished node ids from ProcessInput.
+finished_nodes: A 1-d Tensor of finished node ids from ProcessInput.
 )doc");
 
 REGISTER_OP("ProcessInputV4")
@@ -119,7 +119,7 @@ sparse_input_values: The values tensor from the SparseTensor input.
 sparse_input_shape: The shape tensor from the SparseTensor input.
 input_labels: The training batch's labels as a 1 or 2-d tensor.
   'input_labels[i][j]' gives the j-th label/target for the i-th input.
-input_weights: The training batch's eample weights as a 1-d tensor.
+input_weights: The training batch's weights as a 1-d tensor.
   'input_weights[i]' gives the weight for the i-th input.
 finished_nodes: A 1-d tensor of node ids that have finished and are ready to
   grow.
diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py
index 3650b5d52fe..b9bcbb170b0 100644
--- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py
+++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py
@@ -212,7 +212,7 @@ class ForestHParams(object):
     self.regression = getattr(self, 'regression', False)
 
     # Num_outputs is the actual number of outputs (a single prediction for
-    # classification, a N-dimenensional point for regression).
+    # classification, an N-dimensional point for regression).
     self.num_outputs = self.num_classes if self.regression else 1
 
     # Add an extra column to classes for storing counts, which is needed for
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 2f316767b35..f80b4f1b112 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -11,6 +11,7 @@ exports_files(["LICENSE"])
 
 load(
     "//tensorflow:tensorflow.bzl",
+    "py_test",
     "tf_cc_test",
     "tf_copts",
     "tf_cuda_library",
@@ -52,7 +53,6 @@ tf_custom_op_library(
         "ops/trt_engine_op.cc",
     ],
     deps = [
-        ":trt_engine_op_kernel",
         ":trt_shape_function",
         "//tensorflow/core:lib_proto_parsing",
     ] + if_tensorrt([
@@ -140,6 +140,7 @@ tf_custom_op_py_library(
     ]),
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:resources",
     ],
@@ -174,6 +175,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":wrap_conversion",
+        "//tensorflow/python:tf_optimizer",
     ],
 )
 
@@ -183,6 +185,7 @@ tf_py_wrap_cc(
     copts = tf_copts(),
     deps = [
         ":trt_conversion",
+        ":trt_engine_op_kernel",
         "//tensorflow/core:framework_lite",
         "//util/python:python_headers",
     ],
@@ -272,3 +275,19 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
+
+py_test(
+    name = "tf_trt_integration_test",
+    srcs = ["test/tf_trt_integration_test.py"],
+    main = "test/tf_trt_integration_test.py",
+    srcs_version = "PY2AND3",
+    tags = [
+        "manual",
+        "notap",
+    ],
+    deps = [
+        ":init_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
diff --git a/tensorflow/contrib/tensorrt/README.md b/tensorflow/contrib/tensorrt/README.md
index 6eafc1754ca..687dee07e13 100644
--- a/tensorflow/contrib/tensorrt/README.md
+++ b/tensorflow/contrib/tensorrt/README.md
@@ -1,59 +1,29 @@
 # Using TensorRT in TensorFlow
 
-
-This module provides necessary bindings and introduces TRT_engine_op
-operator that wraps a subgraph in TensorRT. This is still a work in progress
-but should be useable with most common graphs.
+This module provides the necessary bindings and introduces the TRT_engine_op
+operator that wraps a subgraph in TensorRT. This is still a work in progress
+but should be usable with most common graphs.
 
 ## Compilation
 
-
-In order to compile the module, you need to have a local TensorRT
-installation ( libnvinfer.so and respective include files ). During the
-configuration step, TensorRT should be enabled and installation path
-should be set. If installed through package managers (deb,rpm),
-configure script should find the necessary components from the system
-automatically. If installed from tar packages, user has to set path to
-location where the library is installed during configuration.
+In order to compile the module, you need a local TensorRT installation
+(libnvinfer.so and the respective include files). During the configuration
+step, TensorRT should be enabled and its installation path should be set. If
+TensorRT was installed through a package manager (deb, rpm), the configure
+script should find the necessary components automatically. If it was installed
+from a tar package, the user has to set the path to the install location
+during configuration.
 
 ```shell
 bazel build --config=cuda --config=opt //tensorflow/tools/pip_package:build_pip_package
 bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/
 ```
 
-After the installation of tensorflow package, TensorRT transformation
-will be available. An example use can be found in test/test_tftrt.py script
+After the installation of the tensorflow package, the TensorRT transformation
+will be available. An example use can be found in the test/test_tftrt.py script.
 
 ## Installing TensorRT 3.0.4
 
-In order to make use of TensorRT integration, you will need a local installation of TensorRT 3.0.4 from the [NVIDIA Developer website](https://developer.nvidia.com/tensorrt). Due to compiler compatibility, you will need to download and install the TensorRT 3.0.4 tarball for _Ubuntu 14.04_, i.e., **_TensorRT-3.0.4.Ubuntu-14.04.5.x86_64.cuda-9.0.cudnn7.0-tar.gz_**, even if you are using Ubuntu 16.04 or later.
-
-### Preparing TensorRT installation
-
-Once you have downloaded TensorRT-3.0.4.Ubuntu-14.04.5.x86_64.cuda-9.0.cudnn7.0-tar.gz, you will need to unpack it to an installation directory, which will be referred to as . Please replace  with the full path of actual installation directory you choose in commands below.
-
-```shell
-cd  && tar -zxf /path/to/TensorRT-3.0.4.Ubuntu-14.04.5.x86_64.cuda-9.0.cudnn7.0-tar.gz
-```
-
-After unpacking the binaries, you have several options to use them:
-
-#### To run TensorFlow as a user without superuser privileges
-
-For a regular user without any sudo rights, you should add TensorRT to your `$LD_LIBRARY_PATH`:
-
-  ```shell
-   export LD_LIBRARY_PATH=/TensorRT-3.0.4/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
-  ```
-
-Then you are ready to use TensorFlow-TensorRT integration. `$LD_LIBRARY_PATH` must contain the path to TensorRT installation for TensorFlow-TensorRT integration to work. If you are using a VirtualEnv-like setup, you can add the command above to your `bin/activate` script or to your `.bashrc` script.
-
-#### To run TensorFlow as a superuser
-
- When running as a superuser, such as in a container or via sudo, the `$LD_LIBRARY_PATH` approach above may not work. The following is preferred when the user has superuser privileges:
-
-  ```shell
-  echo "/TensorRT-3.0.4/lib" | sudo tee /etc/ld.so.conf.d/tensorrt304.conf && sudo ldconfig
-  ```
-
-  Please ensure that any existing deb package installation of TensorRT is removed before following these instructions to avoid package conflicts.
\ No newline at end of file
+In order to make use of TensorRT integration, you will need a local installation
+of TensorRT 3.0.4 from the [NVIDIA Developer website](https://developer.nvidia.com/tensorrt).
+Installation instructions for compatibility with TensorFlow are provided on the
+[TensorFlow Installation page](https://www.tensorflow.org/install/install_linux#nvidia_requirements_to_run_tensorflow_with_gpu_support).
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
index e663eed4dd6..9c3698e5d1c 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
+++ b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
@@ -19,6 +19,12 @@ limitations under the License.
 namespace tensorflow {
 namespace tensorrt {
 
+std::shared_ptr<TRTResourceManager>
+tensorflow::tensorrt::TRTResourceManager::instance() {
+  static std::shared_ptr<TRTResourceManager> instance_(new TRTResourceManager);
+  return instance_;
+}
+
 std::shared_ptr<tensorflow::ResourceMgr>
 tensorflow::tensorrt::TRTResourceManager::getManager(const string& op_name) {
   // mutex is held for lookup only. Most instantiations where mutex will be held
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
index 5f8ad491d3c..bc15b51e05e 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
@@ -29,11 +29,7 @@ class TRTResourceManager {
   TRTResourceManager() = default;
 
  public:
-  static std::shared_ptr<TRTResourceManager> instance() {
-    static std::shared_ptr<TRTResourceManager> instance_(
-        new TRTResourceManager);
-    return instance_;
-  }
+  static std::shared_ptr<TRTResourceManager> instance();
  // Returns a manager for the given op; if it doesn't exist, it creates one.
  std::shared_ptr<tensorflow::ResourceMgr> getManager(const string& op_name);
 
diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
new file mode 100644
index 00000000000..7a473287628
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
@@ -0,0 +1,156 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import warnings
+import numpy as np
+
+from tensorflow.contrib import tensorrt as trt
+from tensorflow.core.protobuf import config_pb2 as cpb2
+from tensorflow.python.framework import constant_op as cop
+from tensorflow.python.framework import dtypes as dtypes
+from tensorflow.python.framework import importer as importer
+from tensorflow.python.framework import ops as ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops as aops
+from tensorflow.python.ops import nn as nn
+from tensorflow.python.ops import nn_ops as nn_ops
+from tensorflow.python.platform import googletest
+
+
+@test_util.with_c_api
+class IntegrationTest(test_util.TensorFlowTestCase):
+  """Class to test Tensorflow-TensorRT integration."""
+
+  def setUp(self):
+    """Setup method."""
+    super(IntegrationTest, self).setUp()
+    warnings.simplefilter("always")
+    inp_dims = (100, 24, 24, 2)
+    self._input = np.random.random_sample(inp_dims)
+    self._original_graph = self.get_simple_graph_def()
+    self._gpu_options = cpb2.GPUOptions(
+        per_process_gpu_memory_fraction=0.50)
+    self._config = cpb2.ConfigProto(gpu_options=self._gpu_options)
+    self._reference = self.run_graph(self._original_graph, self._input)
+
+  def get_simple_graph_def(self):
+    """Create a simple graph and return its graph_def."""
+    g = ops.Graph()
+    with g.as_default():
+      a = aops.placeholder(
+          dtype=dtypes.float32, shape=(None, 24, 24, 2), name="input")
+      e = cop.constant(
+          [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
+          name="weights",
+          dtype=dtypes.float32)
+      conv = nn.conv2d(
+          input=a,
+          filter=e,
+          strides=[1, 2, 2, 1],
+          padding="SAME",
+          name="conv")
+      b = cop.constant(
+          [4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtypes.float32)
+      t = nn.bias_add(conv, b, name="biasAdd")
+      relu = nn.relu(t, "relu")
+      idty = aops.identity(relu, "ID")
+      v = nn_ops.max_pool(
+          idty, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
+      aops.squeeze(v, name="output")
+    return g.as_graph_def()
+
+  def run_graph(self, gdef, dumm_inp):
+    """Run given graphdef once."""
+    ops.reset_default_graph()
+    g = ops.Graph()
+    with g.as_default():
+      inp, out = importer.import_graph_def(
+          graph_def=gdef, return_elements=["input", "output"])
+      inp = inp.outputs[0]
+      out = out.outputs[0]
+    with self.test_session(
+        graph=g, config=self._config, use_gpu=True,
+        force_gpu=True) as sess:
+      val = sess.run(out, {inp: dumm_inp})
+    return val
+
+  # Use real data that is representative of the inference dataset
+  # for calibration. For this test script it is random data.
+  def run_calibration(self, gdef, dumm_inp):
+    """Run given calibration graph multiple times."""
+    ops.reset_default_graph()
+    g = ops.Graph()
+    with g.as_default():
+      inp, out = importer.import_graph_def(
+          graph_def=gdef, return_elements=["input", "output"])
+      inp = inp.outputs[0]
+      out = out.outputs[0]
+      # Run over real calibration data here; we are mimicking a calibration
+      # set of 30 different batches. Use as much calibration data as you want.
+    with self.test_session(
+        graph=g, config=self._config, use_gpu=True,
+        force_gpu=True) as sess:
+      for _ in range(30):
+        val = sess.run(out, {inp: dumm_inp})
+    return val
+
+  def get_trt_graph(self, mode):
+    """Return trt converted graph."""
    if mode in ["FP32", "FP16", "INT8"]:
+      return trt.create_inference_graph(
+          input_graph_def=self._original_graph,
+          outputs=["output"],
+          max_batch_size=self._input.shape[0],
+          max_workspace_size_bytes=1 << 25,
+          precision_mode=mode,  # TRT Engine precision "FP32","FP16" or "INT8"
+          minimum_segment_size=2  # minimum number of nodes in an engine
+          )
+    return None
+
+  def testFP32(self):
+    """Test FP32 conversion. Results should be identical to native case."""
+    trt_graph = self.get_trt_graph("FP32")
+    result = self.run_graph(trt_graph, self._input)
+    self.assertAllEqual(self._reference, result)
+    result1 = self.run_graph(trt_graph, self._input)
+    self.assertAllEqual(result1, result)
+
+  def testFP16(self):
+    """Test FP16 conversion. Results may be different from native case."""
+    trt_graph = self.get_trt_graph("FP16")
+    result = self.run_graph(trt_graph, self._input)
+    self.assertAllClose(self._reference, result, rtol=1.e-03)
+    result1 = self.run_graph(trt_graph, self._input)
+    self.assertAllEqual(result1, result)
+
+  def testINT8(self):
+    """Test INT8 conversion. Results may be different from native case."""
+    calib_graph = self.get_trt_graph("INT8")
+    result = self.run_calibration(calib_graph, self._input)
+    self.assertAllEqual(self._reference, result)
+    int8_graph = trt.calib_graph_to_infer_graph(calib_graph)
+    result = self.run_graph(int8_graph, self._input)
+    self.assertAllClose(self._reference, result, rtol=1.e-03)
+    result1 = self.run_graph(int8_graph, self._input)
+    self.assertAllEqual(result1, result)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
index 26793c80bfb..9b593fecbb3 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
@@ -60,7 +60,7 @@ def clip_covariance(
   # TODO(allenl): Smarter scaling here so that correlations are preserved when
   # fiddling with diagonal elements.
   diagonal = array_ops.matrix_diag_part(covariance_matrix)
-  maximum = math_ops.reduce_max(diagonal, axis=-1, keep_dims=True)
+  maximum = math_ops.reduce_max(diagonal, axis=-1, keepdims=True)
   new_diagonal = gen_math_ops.maximum(
       diagonal, maximum / maximum_variance_ratio)
   return array_ops.matrix_set_diag(
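
A quick numeric illustration (values assumed) of what the clipping above does: no diagonal entry may fall below the largest variance divided by `maximum_variance_ratio`:

```python
import numpy as np

cov = np.diag([100.0, 1e-6, 4.0])
maximum_variance_ratio = 1e4

diagonal = np.diag(cov).copy()
maximum = diagonal.max(keepdims=True)                      # 100.0
new_diagonal = np.maximum(diagonal, maximum / maximum_variance_ratio)
np.fill_diagonal(cov, new_diagonal)                        # 1e-6 is raised to 1e-2
```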
diff --git a/tensorflow/contrib/training/python/training/resample.py b/tensorflow/contrib/training/python/training/resample.py
index b16159bc16b..7b8332b1d67 100644
--- a/tensorflow/contrib/training/python/training/resample.py
+++ b/tensorflow/contrib/training/python/training/resample.py
@@ -77,7 +77,7 @@ def resample_at_rate(inputs, rates, scope=None, seed=None, back_prop=False):
 
   Args:
     inputs: A list of tensors, each of which has a shape of `[batch_size, ...]`
-    rates: A tensor of shape `[batch_size]` contiaining the resampling rates
+    rates: A tensor of shape `[batch_size]` containing the resampling rates
        for each input.
     scope: Scope for the op.
     seed: Random seed to use.
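
A minimal usage sketch of `resample_at_rate` (the values and rates below are illustrative):

```python
import tensorflow as tf
from tensorflow.contrib.training import resample_at_rate

values = tf.constant([[1.0], [2.0], [3.0]])   # [batch_size, ...]
rates = tf.constant([0.5, 1.0, 2.0])          # per-row resampling rates

# Returns a list matching `inputs`; row i appears in the output a random
# number of times with expectation rates[i], so the output batch size varies.
resampled, = resample_at_rate([values], rates, seed=42)
```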
diff --git a/tensorflow/contrib/training/python/training/sampling_ops.py b/tensorflow/contrib/training/python/training/sampling_ops.py
index ba888f87dc8..7140f2a46d5 100644
--- a/tensorflow/contrib/training/python/training/sampling_ops.py
+++ b/tensorflow/contrib/training/python/training/sampling_ops.py
@@ -123,7 +123,7 @@ def rejection_sample(tensors,
         batch_size=batch_size,
         num_threads=queue_threads)
 
-    # Queues return a single tensor if the list of enqued tensors is one. Since
+    # Queues return a single tensor if the list of enqueued tensors is one. Since
     # we want the type to always be the same, always return a list.
     if isinstance(minibatch, ops.Tensor):
       minibatch = [minibatch]
@@ -312,7 +312,7 @@ def _verify_input(tensor_list, labels, probs_list):
   """Verify that batched inputs are well-formed."""
   checked_probs_list = []
   for probs in probs_list:
-    # Since number of classes shouldn't change at runtime, probalities shape
+    # Since number of classes shouldn't change at runtime, probabilities shape
     # should be fully defined.
     probs.get_shape().assert_is_fully_defined()
 
@@ -407,7 +407,7 @@ def _calculate_acceptance_probabilities(init_probs, target_probs):
   ```
 
 
-  A solution for a_i in terms of the other variabes is the following:
+  A solution for a_i in terms of the other variables is the following:
     ```a_i = (t_i / p_i) / max_i[t_i / p_i]```
   """
   # Make list of t_i / p_i.
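
A worked instance of this formula with assumed class mixes:

```python
import numpy as np

init_probs = np.array([0.7, 0.2, 0.1])        # p_i: mix arriving at the queue
target_probs = np.array([1., 1., 1.]) / 3.0   # t_i: desired uniform mix

ratio = target_probs / init_probs             # t_i / p_i = [0.476, 1.667, 3.333]
accept_probs = ratio / ratio.max()            # a_i = [0.143, 0.5, 1.0]
# The rarest class is always accepted; over-represented classes are accepted
# just often enough that the accepted stream matches target_probs.
```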
diff --git a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
index 99d486b1833..39d75a08060 100644
--- a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
+++ b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
@@ -876,7 +876,7 @@ class SequenceQueueingStateSaver(object):
         ]):
           self._length = array_ops.identity(self._length)
 
-        # Only create barrier; enqueu and dequeue operations happen when you
+        # Only create barrier; enqueue and dequeue operations happen when you
         # access prefetch_op and next_batch.
         self._create_barrier()
         self._scope = scope
@@ -1637,7 +1637,7 @@ def _move_sparse_tensor_out_context(input_context, input_sequences, num_unroll):
 
   For `key, value` pairs in `input_context` with `SparseTensor` `value` removes
   them from `input_context` and transforms the `value` into a sequence and
-  then adding `key`, transformed `value` into `input_seuqences`.
+  then adds `key` and the transformed `value` to `input_sequences`.
   The transformation is done by adding a new first dimension of `value_length`
   equal to that of the other values in input_sequences` and tiling the `value`
   every `num_unroll` steps.
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index a2ff29724bb..ba1fd415655 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -145,6 +145,7 @@ load(
     "if_static",
 )
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load("@io_bazel_rules_closure//closure:defs.bzl", "closure_proto_library")
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
@@ -247,6 +248,15 @@ tf_nano_proto_library(
     deps = [":protos_all_cc"],
 )
 
+proto_library(
+    name = "example_protos",
+    srcs = [
+        "example/example.proto",
+        "example/feature.proto",
+    ],
+    visibility = ["//visibility:public"],
+)
+
 exports_files([
     "framework/types.proto",
 ])
@@ -4066,3 +4076,9 @@ alias(
     actual = ":mobile_srcs",
     visibility = ["//visibility:public"],
 )
+
+closure_proto_library(
+    name = "example_protos_closure",
+    visibility = ["//visibility:public"],
+    deps = [":example_protos"],
+)
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
new file mode 100644
index 00000000000..145d05de59a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdaMax.pbtxt
@@ -0,0 +1,78 @@
+op {
+  graph_op_name: "ApplyAdaMax"
+  visibility: HIDDEN
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt b/tensorflow/core/api_def/base_api/api_def_BroadcastTo.pbtxt
new file mode 100644
+op {
+  graph_op_name: "BroadcastTo"
+  description: <<END
+For example,
+```
+>>> x = tf.constant([1, 2, 3])
+>>> y = tf.broadcast_to(x, [3, 3])
+>>> sess.run(y)
+array([[1, 2, 3],
+       [1, 2, 3],
+       [1, 2, 3]], dtype=int32)
+```
+In the above example, the input Tensor with the shape of `[1, 3]`
+is broadcast to an output Tensor with the shape of `[3, 3]`.
+END
+}
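
For reference, the `ApplyAdaMax` op added above (and its `ResourceApplyAdaMax` variant later in this patch) corresponds to the AdaMax variant of Adam; a sketch of the standard update rule in the usual notation, where `lr`, `m`, `v`, and `epsilon` map to the op's inputs:

```latex
m_t = \beta_1 m_{t-1} + (1-\beta_1)\, g_t, \qquad
v_t = \max(\beta_2 v_{t-1}, |g_t|), \qquad
\theta_t = \theta_{t-1} - \frac{\eta}{1-\beta_1^t}\,\frac{m_t}{v_t + \varepsilon}
```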
diff --git a/tensorflow/core/api_def/base_api/api_def_ImageSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_ImageSummary.pbtxt
index 9b00f5b19d9..56a3658fa02 100644
--- a/tensorflow/core/api_def/base_api/api_def_ImageSummary.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ImageSummary.pbtxt
@@ -61,7 +61,7 @@ build the `tag` of the summary values:
    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
 
 The `bad_color` argument is the color to use in the generated images for
-non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
+non-finite input values.  It is a `uint8` 1-D tensor of length `channels`.
 Each element must be in the range `[0, 255]` (It represents the value of a
 pixel in the output image).  Non-finite values in the input tensor are
 replaced by this tensor in the output image.  The default value is the color
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt
new file mode 100644
index 00000000000..a3f2188ba50
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdaMax.pbtxt
@@ -0,0 +1,72 @@
+op {
+  graph_op_name: "ResourceApplyAdaMax"
+  visibility: HIDDEN
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+}
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -299,7 +321,7 @@ take the following steps:
      $ sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl
      
 If this step fails, see
@@ -485,7 +507,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      (tensorflow)$ pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl
 ## Validate your installation
@@ -659,14 +681,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp27-none-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp27-none-linux_x86_64.whl
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -678,14 +700,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp34-cp34m-linux_x86_64.whl
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -697,14 +719,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp35-cp35m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp35-cp35m-linux_x86_64.whl
 
@@ -716,14 +738,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp36-cp36m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp36-cp36m-linux_x86_64.whl
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index b3e9616a059..a237d1af540 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv:
 TensorFlow in the active Virtualenv is as follows:
 $ pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -242,7 +242,7 @@ take the following steps:
 issue the following command:
 $ sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl 
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl
 If the preceding command fails, see
 [installation problems](#common-installation-problems).
@@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 TensorFlow for Python 2.7:
 (targetDirectory)$ pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.whl
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py2-none-any.whl
@@ -524,7 +524,7 @@ The value you specify depends on your Python version.
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py2-none-any.whl
 
@@ -532,5 +532,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl
 
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 26287aa3a16..b1867586530 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -354,10 +354,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
-for TensorFlow 1.7.0 on Linux:
+for TensorFlow 1.8.0rc0 on Linux:
-$ sudo pip install /tmp/tensorflow_pkg/tensorflow-1.7.0-py2-none-any.whl
+$ sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0rc0-py2-none-any.whl
 
 ## Validate your installation
@@ -454,6 +454,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Linux**
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.8.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.8.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.7.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>N/A</td><td>N/A</td></tr>
@@ -475,6 +477,7 @@ Stack Overflow and specify the `tensorflow` tag.
 **Mac**
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.8.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
@@ -490,6 +493,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Windows**
+<tr><td>tensorflow-1.8.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.8.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
diff --git a/tensorflow/docs_src/mobile/android_build.md b/tensorflow/docs_src/mobile/android_build.md
index 08a5fbe41c8..c35530061dc 100644
--- a/tensorflow/docs_src/mobile/android_build.md
+++ b/tensorflow/docs_src/mobile/android_build.md
@@ -51,7 +51,8 @@ If you haven't already, do the following two things:
         // set to 'bazel', 'cmake', 'makefile', 'none'
         def nativeBuildSystem = 'none'
-4. Click the Run button (the green arrow) or use **Run -> Run 'android'** from the top menu.
+4. Click the *Run* button (the green arrow) or select *Run > Run 'android'* from the
+   top menu. You may need to rebuild the project using *Build > Rebuild Project*.
 If it asks you to use Instant Run, click **Proceed Without Instant Run**.
diff --git a/tensorflow/docs_src/performance/quantization.md b/tensorflow/docs_src/performance/quantization.md
index 411889cb1c6..2fea02d861d 100644
--- a/tensorflow/docs_src/performance/quantization.md
+++ b/tensorflow/docs_src/performance/quantization.md
@@ -110,7 +110,7 @@ we've added a separate rewrite for the *eval graph*:
 ```
 # Build eval model
-logits = tf.nn.softmax_cross_entropy_with_logits(...)
+logits = tf.nn.softmax_cross_entropy_with_logits_v2(...)
 # Call the eval rewrite which rewrites the graph in-place with
 # FakeQuantization nodes and fold batchnorm for eval.
diff --git a/tensorflow/docs_src/programmers_guide/debugger.md b/tensorflow/docs_src/programmers_guide/debugger.md
index f5a0eb0a200..f7817b06d4c 100644
--- a/tensorflow/docs_src/programmers_guide/debugger.md
+++ b/tensorflow/docs_src/programmers_guide/debugger.md
@@ -400,7 +400,7 @@ diff = -(y_ * tf.log(y))
 to the built-in, numerically-stable implementation of softmax cross-entropy:
 ```python
-diff = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=logits)
+diff = tf.losses.softmax_cross_entropy(labels=y_, logits=logits)
 ```
 Rerun with the `--debug` flag as follows:
diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md
index aa72cae766c..f0dd8def17f 100644
--- a/tensorflow/docs_src/programmers_guide/graphs.md
+++ b/tensorflow/docs_src/programmers_guide/graphs.md
@@ -210,7 +210,7 @@ with tf.device("/device:GPU:0"):
   # Operations created in this context will be pinned to the GPU.
   result = tf.matmul(weights, img)
 ```
-If you are deploying TensorFlow in a @{$deploy/distributed$typical distributed configuration},
+If you are deploying TensorFlow in a @{$distributed$typical distributed configuration},
 you might specify the job name and task ID to place variables on a task in the
 parameter server job (`"/job:ps"`), and the other operations on task in the
 worker job (`"/job:worker"`):
@@ -362,7 +362,7 @@ operations that are needed to compute the result. @{tf.Session.run} requires
 you to specify a list of **fetches**, which determine the return values, and
 may be a @{tf.Operation}, a @{tf.Tensor}, or
-a [tensor-like type](#tensor-like-objects) such as @{tf.Variable}. These fetches
+a [tensor-like type](#tensor-like_objects) such as @{tf.Variable}. These fetches
 determine what **subgraph** of the overall @{tf.Graph} must be executed to
 produce the result: this is the subgraph that contains all operations named in
 the fetch list, plus all operations whose outputs are used to compute the value
@@ -505,7 +505,7 @@ multiple graphs in the same process. As noted above, TensorFlow provides a
 "default graph" that is implicitly passed to all API functions in the same
 context. For many applications, a single graph is sufficient.
 However, TensorFlow also provides methods for manipulating
-the default graph, which can be useful in more advanced used cases. For example:
+the default graph, which can be useful in more advanced use cases. For example:
 * A @{tf.Graph} defines the namespace for @{tf.Operation} objects: each
   operation in a single graph must have a unique name. TensorFlow will
diff --git a/tensorflow/docs_src/programmers_guide/saved_model.md b/tensorflow/docs_src/programmers_guide/saved_model.md
index 55ee42dd640..c6ef87c54a3 100644
--- a/tensorflow/docs_src/programmers_guide/saved_model.md
+++ b/tensorflow/docs_src/programmers_guide/saved_model.md
@@ -485,31 +485,7 @@ portion of the signature. That is, when writing a
 to expect and how to map them to your model's expected inputs. By contrast, the
 *output* portion of the signature is determined by the model.
-
-### Perform the export
-
-To export your trained Estimator, call
-@{tf.estimator.Estimator.export_savedmodel} with the export base path and
-the `serving_input_receiver_fn`.
-
-```py
-estimator.export_savedmodel(export_dir_base, serving_input_receiver_fn,
-                            strip_default_attrs=True)
-```
-
-This method builds a new graph by first calling the
-`serving_input_receiver_fn()` to obtain feature `Tensor`s, and then calling
-this `Estimator`'s `model_fn()` to generate the model graph based on those
-features. It starts a fresh `Session`, and, by default, restores the most recent
-checkpoint into it. (A different checkpoint may be passed, if needed.)
-Finally it creates a time-stamped export directory below the given
-`export_dir_base` (i.e., `export_dir_base/`), and writes a
-SavedModel into it containing a single `MetaGraphDef` saved from this
-Session.
-
-> Note: It is your responsibility to garbage-collect old exports.
-> Otherwise, successive exports will accumulate under `export_dir_base`.
-
+
 ### Specify the outputs of a custom model
 
 When writing a custom `model_fn`, you must populate the `export_outputs` element
@@ -541,6 +517,30 @@ using
 [`signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY`](https://www.tens
 indicating which `SignatureDef` will be served when an inference request
 does not specify one.
+
+### Perform the export
+
+To export your trained Estimator, call
+@{tf.estimator.Estimator.export_savedmodel} with the export base path and
+the `serving_input_receiver_fn`.
+
+```py
+estimator.export_savedmodel(export_dir_base, serving_input_receiver_fn,
+                            strip_default_attrs=True)
+```
+
+This method builds a new graph by first calling the
+`serving_input_receiver_fn()` to obtain feature `Tensor`s, and then calling
+this `Estimator`'s `model_fn()` to generate the model graph based on those
+features. It starts a fresh `Session`, and, by default, restores the most recent
+checkpoint into it. (A different checkpoint may be passed, if needed.)
+Finally it creates a time-stamped export directory below the given
+`export_dir_base` (i.e., `export_dir_base/`), and writes a
+SavedModel into it containing a single `MetaGraphDef` saved from this
+Session.
+
+> Note: It is your responsibility to garbage-collect old exports.
+> Otherwise, successive exports will accumulate under `export_dir_base`.
 ### Serve the exported model locally
diff --git a/tensorflow/docs_src/programmers_guide/using_tpu.md b/tensorflow/docs_src/programmers_guide/using_tpu.md
index cb0d86fc4c5..5e3e49d4340 100644
--- a/tensorflow/docs_src/programmers_guide/using_tpu.md
+++ b/tensorflow/docs_src/programmers_guide/using_tpu.md
@@ -280,8 +280,8 @@ Where `params['batch-size']` will contain the batch size.
 ### Static shapes and batch size
 The input pipeline generated by your `input_fn` is run on CPU. So it is mostly
-free strict static shape requirements imposed by the XLA/TPU environment. The
-one requirement is that the batches of data fed from your input pipeline to
+free from the strict static shape requirements imposed by the XLA/TPU environment.
+The one requirement is that the batches of data fed from your input pipeline to
 the TPU have a static shape, as determined by the standard TensorFlow shape
 inference algorithm. Intermediate tensors are free to have dynamic shapes.
 If shape inference has failed, but the shape is known it is possible to
diff --git a/tensorflow/docs_src/tutorials/audio_recognition.md b/tensorflow/docs_src/tutorials/audio_recognition.md
index 7d79f433c41..372ab47df7d 100644
--- a/tensorflow/docs_src/tutorials/audio_recognition.md
+++ b/tensorflow/docs_src/tutorials/audio_recognition.md
@@ -280,7 +280,7 @@ tool:
 ```
 bazel run tensorflow/examples/wav_to_spectrogram:wav_to_spectrogram -- \
 --input_wav=/tmp/speech_dataset/happy/ab00c4b2_nohash_0.wav \
---output_png=/tmp/spectrogram.png
+--output_image=/tmp/spectrogram.png
 ```
 If you open up `/tmp/spectrogram.png` you should see something like this:
diff --git a/tensorflow/docs_src/tutorials/layers.md b/tensorflow/docs_src/tutorials/layers.md
index cadaec391d8..37cd2bb1397 100644
--- a/tensorflow/docs_src/tutorials/layers.md
+++ b/tensorflow/docs_src/tutorials/layers.md
@@ -192,8 +192,7 @@ dive deeper into the `tf.layers` code used to create each layer, as well as how
 to calculate loss, configure the training op, and generate predictions. If
 you're already experienced with CNNs and @{$get_started/custom_estimators$TensorFlow `Estimator`s},
 and find the above code intuitive, you may want to skim these sections or just
-skip ahead to ["Training and Evaluating the CNN MNIST
-Classifier"](#training_and_evaluating_the_cnn_mnist_classifier).
+skip ahead to ["Training and Evaluating the CNN MNIST Classifier"](#train_eval_mnist).
 ### Input Layer
@@ -536,8 +535,9 @@ if mode == tf.estimator.ModeKeys.TRAIN:
 ```
 > Note: For a more in-depth look at configuring training ops for Estimator model
-> functions, see @{$get_started/custom_estimators#defining_the_training_op_for_the_model$"Defining the training op for the model"}
-> in the @{$get_started/custom_estimators$"Creating Estimators in tf.estimator."} tutorial.
+> functions, see @{$get_started/custom_estimators#defining-the-training-op-for-the-model$"Defining the training op for the model"}
+> in the @{$get_started/custom_estimators$"Creating Estimators in tf.estimator"} tutorial.
+
 ### Add evaluation metrics
@@ -552,7 +552,8 @@ return tf.estimator.EstimatorSpec(
     mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
 ```
-## Training and Evaluating the CNN MNIST Classifier {#training_and_evaluating_the_cnn_mnist_classifier}
+
+## Training and Evaluating the CNN MNIST Classifier
 We've coded our MNIST CNN model function; now we're ready to train and
 evaluate it.
@@ -612,9 +613,9 @@ following to `main()`: ```python # Set up logging for predictions - tensors_to_log = {"probabilities": "softmax_tensor"} - logging_hook = tf.train.LoggingTensorHook( - tensors=tensors_to_log, every_n_iter=50) +tensors_to_log = {"probabilities": "softmax_tensor"} +logging_hook = tf.train.LoggingTensorHook( + tensors=tensors_to_log, every_n_iter=50) ``` We store a dict of the tensors we want to log in `tensors_to_log`. Each key is a diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py index 14ae7fbf358..b09ee997689 100644 --- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py +++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py @@ -224,7 +224,7 @@ with graph.as_default(): optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss) # Compute the cosine similarity between minibatch examples and all embeddings. - norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True)) + norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True)) normalized_embeddings = embeddings / norm valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index ec7d9dcc4f1..c31ca8b67a1 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -21159,7 +21159,7 @@ func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { // generated sequentially as '*tag*/image/0', '*tag*/image/1', etc. // // The `bad_color` argument is the color to use in the generated images for -// non-finite input values. It is a `unit8` 1-D tensor of length `channels`. +// non-finite input values. It is a `uint8` 1-D tensor of length `channels`. // Each element must be in the range `[0, 255]` (It represents the value of a // pixel in the output image). Non-finite values in the input tensor are // replaced by this tensor in the output image. The default value is the color diff --git a/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java b/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java index 489e95c3102..3948991c84d 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java +++ b/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java @@ -101,6 +101,7 @@ public class LabelImage { b.constant("mean", mean)), b.constant("scale", scale)); try (Session s = new Session(g)) { + // Generally, there may be multiple output tensors, all of them must be closed to prevent resource leaks. return s.runner().fetch(output.op().name()).run().get(0).expect(Float.class); } } @@ -110,6 +111,7 @@ public class LabelImage { try (Graph g = new Graph()) { g.importGraphDef(graphDef); try (Session s = new Session(g); + // Generally, there may be multiple output tensors, all of them must be closed to prevent resource leaks. 
Tensor result = s.runner().feed("input", image).fetch("output").run().get(0).expect(Float.class)) { final long[] rshape = result.shape(); diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 9dc03d7cdbc..8e7f0cadad7 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -1946,7 +1946,8 @@ py_library( ":array_ops", ":constant_op", ":dtypes", - ":linalg_ops", + ":linalg_ops_gen", + ":linalg_ops_impl", ":math_ops", ":nn_ops", ":random_ops", @@ -1997,7 +1998,22 @@ py_library( ":array_ops", ":dtypes", ":framework_ops", + ":functional_ops", ":linalg_ops_gen", + ":linalg_ops_impl", + ":math_ops", + "//third_party/py/numpy", + ], +) + +py_library( + name = "linalg_ops_impl", + srcs = ["ops/linalg_ops_impl.py"], + srcs_version = "PY2AND3", + deps = [ + ":array_ops", + ":dtypes", + ":framework_ops", ":math_ops", "//third_party/py/numpy", ], @@ -3493,6 +3509,7 @@ tf_py_wrap_cc( "//tensorflow/core/profiler/internal:print_model_analysis", "//tensorflow/tools/graph_transforms:transform_graph_lib", "//tensorflow/python/eager:pywrap_tfe_lib", + "//tensorflow/python/eager:python_eager_op_gen", "//util/python:python_headers", ] + (tf_additional_lib_deps() + tf_additional_plugin_deps() + diff --git a/tensorflow/python/debug/cli/readline_ui.py b/tensorflow/python/debug/cli/readline_ui.py index 151638789f7..3296e45d07e 100644 --- a/tensorflow/python/debug/cli/readline_ui.py +++ b/tensorflow/python/debug/cli/readline_ui.py @@ -19,6 +19,8 @@ from __future__ import print_function import readline +import six + from tensorflow.python.debug.cli import base_ui from tensorflow.python.debug.cli import debugger_cli_common @@ -39,11 +41,7 @@ class ReadlineUI(base_ui.BaseUI): readline.set_completer(self._readline_complete) readline.parse_and_bind("tab: complete") - # For Python 2-3 compatibility. - try: - self._input = raw_input - except NameError: - self._input = input + self._input = six.moves.input def _readline_complete(self, text, state): context, prefix, except_last_word = self._analyze_tab_complete_input(text) diff --git a/tensorflow/python/debug/wrappers/grpc_wrapper.py b/tensorflow/python/debug/wrappers/grpc_wrapper.py index fb9494f5763..1f9c8fa5a96 100644 --- a/tensorflow/python/debug/wrappers/grpc_wrapper.py +++ b/tensorflow/python/debug/wrappers/grpc_wrapper.py @@ -21,6 +21,8 @@ import signal import sys import traceback +import six + # Google-internal import(s). from tensorflow.python.debug.lib import common from tensorflow.python.debug.wrappers import framework @@ -140,14 +142,9 @@ class GrpcDebugWrapperSession(framework.NonInteractiveDebugWrapperSession): def _signal_handler(unused_signal, unused_frame): - try: - input_func = raw_input - except NameError: - # Python 3 does not have raw_input. - input_func = input - while True: - response = input_func("\nSIGINT received. Quit program? (Y/n): ").strip() + response = six.moves.input( + "\nSIGINT received. Quit program? (Y/n): ").strip() if response in ("", "Y", "y"): sys.exit(0) elif response in ("N", "n"): diff --git a/tensorflow/python/debug/wrappers/hooks.py b/tensorflow/python/debug/wrappers/hooks.py index 6705cd31e29..5e4604fda4d 100644 --- a/tensorflow/python/debug/wrappers/hooks.py +++ b/tensorflow/python/debug/wrappers/hooks.py @@ -31,15 +31,18 @@ from tensorflow.python.training import session_run_hook class LocalCLIDebugHook(session_run_hook.SessionRunHook): """Command-line-interface debugger hook. - Can be used as a monitor/hook for `tf.train.MonitoredSession`s and - `tf.contrib.learn`'s `Estimator`s and `Experiment`s. 
+ Can be used as a hook for `tf.train.MonitoredSession`s and
+ `tf.estimator.Estimator`s. Provides a substitute for
+ `tfdbg.LocalCLIDebugWrapperSession` in cases where the session is not directly
+ available.
"""
def __init__(self, ui_type="curses", dump_root=None, thread_name_filter=None):
"""Create a local debugger command-line interface (CLI) hook.
Args:
- ui_type: (str) user-interface type.
+ ui_type: (`str`) requested user-interface type. Currently supported:
+ (curses | readline).
dump_root: (`str`) optional path to the dump root directory. Must be a
directory that does not exist or an empty directory. If the directory
does not exist, it will be created by the debugger core during debug
@@ -153,8 +156,8 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook):
class DumpingDebugHook(session_run_hook.SessionRunHook):
"""A debugger hook that dumps debug data to filesystem.
- Can be used as a monitor/hook for `tf.train.MonitoredSession`s and
- `tf.contrib.learn`'s `Estimator`s and `Experiment`s.
+ Can be used as a hook for `tf.train.MonitoredSession`s and
+ `tf.estimator.Estimator`s.
"""
def __init__(self,
@@ -229,8 +232,8 @@ class GrpcDebugHook(session_run_hook.SessionRunHook):
When the arguments of debug_utils.watch_graph change, strongly consider
changing arguments here too so that features are available to tflearn users.
- Can be used as a monitor/hook for `tf.train.MonitoredSession`s and
- `tf.contrib.learn`'s `Estimator`s and `Experiment`s.
+ Can be used as a hook for `tf.train.MonitoredSession`s and
+ `tf.estimator.Estimator`s.
"""
def __init__(self,
diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
index c365ea8b4aa..efa4bdf5980 100644
--- a/tensorflow/python/estimator/canned/head.py
+++ b/tensorflow/python/estimator/canned/head.py
@@ -263,9 +263,12 @@ def _check_dense_labels_match_logits_and_reshape(
if (dim1 is not None) and (dim1 != expected_labels_dimension):
raise ValueError(
'Mismatched label shape. '
- 'Classifier configured with n_classes=%s. Received %s. '
- 'Suggested Fix: check your n_classes argument to the estimator '
- 'and/or the shape of your label.' %
+ 'Expected labels dimension=%s. Received %s. '
+ 'Suggested Fix: '
+ 'If your classifier expects one-hot encoded labels, '
+ 'check your n_classes argument to the estimator '
+ 'and/or the shape of your label. '
+ 'Otherwise, check the shape of your label.' %
(expected_labels_dimension, dim1))
expected_labels_shape = array_ops.concat(
[logits_shape[:-1], [expected_labels_dimension]], axis=0)
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 351fcb64232..2f1212d5a2b 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -207,7 +207,8 @@ class Estimator(object):
else:
self._session_config = self._config.session_config
- self._device_fn = _get_replica_device_setter(self._config)
+ self._device_fn = self._config.device_fn or \
+ _get_replica_device_setter(self._config)
if model_fn is None:
raise ValueError('model_fn must be provided to Estimator.')
@@ -716,7 +717,7 @@ class Estimator(object):
batch_length = batch_length or value.shape[0]
if value.shape[0] != batch_length:
raise ValueError('Batch length of predictions should be same. %s has '
- 'different batch length then others.' % key)
+ 'different batch length than others.'
% key) return batch_length def _extract_keys(self, predictions, predict_keys): diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py index dab442aeda6..8162b249f1f 100644 --- a/tensorflow/python/estimator/run_config.py +++ b/tensorflow/python/estimator/run_config.py @@ -27,11 +27,13 @@ import six from tensorflow.core.protobuf import config_pb2 from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import server_lib +from tensorflow.python.estimator import util from tensorflow.python.util import compat_internal from tensorflow.python.util.tf_export import tf_export _USE_DEFAULT = object() +_VALID_DEVICE_FN_ARGS = set(['op']) # A list of the property names in RunConfig that the user is allowed to change. _DEFAULT_REPLACEABLE_LIST = [ @@ -44,7 +46,8 @@ _DEFAULT_REPLACEABLE_LIST = [ 'keep_checkpoint_max', 'keep_checkpoint_every_n_hours', 'log_step_count_steps', - 'train_distribute' + 'train_distribute', + 'device_fn' ] _SAVE_CKPT_ERR = ( @@ -279,6 +282,11 @@ def _validate_properties(run_config): _validate('tf_random_seed', lambda seed: isinstance(seed, six.integer_types), message='tf_random_seed must be integer.') + _validate('device_fn', lambda device_fn: six.callable(device_fn) and + set(util.fn_args(device_fn)) == _VALID_DEVICE_FN_ARGS, + message='device_fn must be callable with exactly' + ' one argument "op".') + class TaskType(object): MASTER = 'master' @@ -302,7 +310,8 @@ class RunConfig(object): keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000, log_step_count_steps=100, - train_distribute=None): + train_distribute=None, + device_fn=None): """Constructs a RunConfig. All distributed training related properties `cluster_spec`, `is_chief`, @@ -430,6 +439,10 @@ class RunConfig(object): `tf.contrib.distribute.DistributionStrategy`. If specified, then Estimator will distribute the user's model during training, according to the policy specified by that strategy. + device_fn: A callable invoked for every `Operation` that takes the + `Operation` and returns the device string. If `None`, defaults to + the device function returned by `tf.train.replica_device_setter` + with round-robin strategy. Raises: ValueError: If both `save_checkpoints_steps` and `save_checkpoints_secs` @@ -466,7 +479,8 @@ class RunConfig(object): keep_checkpoint_max=keep_checkpoint_max, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours, log_step_count_steps=log_step_count_steps, - train_distribute=train_distribute) + train_distribute=train_distribute, + device_fn=device_fn) self._init_distributed_setting_from_environment_var(tf_config) @@ -568,6 +582,16 @@ class RunConfig(object): def cluster_spec(self): return self._cluster_spec + @property + def device_fn(self): + """Returns the device_fn. + + If device_fn is not `None`, it overrides the default + device function used in `Estimator`. + Otherwise the default one is used. + """ + return self._device_fn + @property def evaluation_master(self): return self._evaluation_master @@ -697,7 +721,8 @@ class RunConfig(object): - `keep_checkpoint_max`, - `keep_checkpoint_every_n_hours`, - `log_step_count_steps`, - - `train_distribute`. + - `train_distribute`, + - `device_fn`. In addition, either `save_checkpoints_steps` or `save_checkpoints_secs` can be set (should not be both). 
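For reference, a minimal usage sketch of the `device_fn` option added above (hypothetical user code, not part of the patch; `my_model_fn` is a placeholder): the validator accepts any callable taking exactly one argument named `op` and returning a device string, so pinning every op to CPU looks like:

```python
import tensorflow as tf

# RunConfig validates that device_fn is callable with exactly one
# argument named "op"; here every op is routed to the CPU.
config = tf.estimator.RunConfig(device_fn=lambda op: "/cpu:0")
estimator = tf.estimator.Estimator(model_fn=my_model_fn, config=config)
```

When `device_fn` is `None`, the Estimator falls back to the `tf.train.replica_device_setter`-based default, as documented in the docstring above.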
diff --git a/tensorflow/python/estimator/run_config_test.py b/tensorflow/python/estimator/run_config_test.py index a3eef4c53fd..c8b12605e1a 100644 --- a/tensorflow/python/estimator/run_config_test.py +++ b/tensorflow/python/estimator/run_config_test.py @@ -42,6 +42,7 @@ _SESSION_CONFIG_ERR = 'session_config must be instance of ConfigProto' _KEEP_CKPT_MAX_ERR = 'keep_checkpoint_max should be >= 0' _KEEP_CKPT_HOURS_ERR = 'keep_checkpoint_every_n_hours should be > 0' _TF_RANDOM_SEED_ERR = 'tf_random_seed must be integer' +_DEVICE_FN_ERR = 'device_fn must be callable with exactly one argument "op".' _ONE_CHIEF_ERR = 'The "cluster" in TF_CONFIG must have only one "chief" node.' _ONE_MASTER_ERR = 'The "cluster" in TF_CONFIG must have only one "master" node.' _INVALID_TASK_TYPE_FOR_EVAL_MASTER = ( @@ -83,6 +84,7 @@ class RunConfigTest(test.TestCase): self.assertEqual(5, config.keep_checkpoint_max) self.assertEqual(10000, config.keep_checkpoint_every_n_hours) self.assertIsNone(config.service) + self.assertIsNone(config.device_fn) def test_model_dir(self): empty_config = run_config_lib.RunConfig() @@ -93,6 +95,7 @@ class RunConfigTest(test.TestCase): def test_replace_with_allowed_properties(self): session_config = config_pb2.ConfigProto(allow_soft_placement=True) + device_fn = lambda op: "/cpu:0" config = run_config_lib.RunConfig().replace( tf_random_seed=11, @@ -100,13 +103,15 @@ class RunConfigTest(test.TestCase): save_checkpoints_secs=14, session_config=session_config, keep_checkpoint_max=16, - keep_checkpoint_every_n_hours=17) + keep_checkpoint_every_n_hours=17, + device_fn=device_fn) self.assertEqual(11, config.tf_random_seed) self.assertEqual(12, config.save_summary_steps) self.assertEqual(14, config.save_checkpoints_secs) self.assertEqual(session_config, config.session_config) self.assertEqual(16, config.keep_checkpoint_max) self.assertEqual(17, config.keep_checkpoint_every_n_hours) + self.assertEqual(device_fn, config.device_fn) def test_replace_none_value(self): config = run_config_lib.RunConfig().replace( @@ -117,7 +122,8 @@ class RunConfigTest(test.TestCase): save_checkpoints_steps=None, session_config=None, keep_checkpoint_max=None, - keep_checkpoint_every_n_hours=None) + keep_checkpoint_every_n_hours=None, + device_fn=None) self.assertIsNone(config.tf_random_seed) self.assertIsNone(config.model_dir) self.assertIsNone(config.save_summary_steps) @@ -126,6 +132,7 @@ class RunConfigTest(test.TestCase): self.assertIsNone(config.session_config) self.assertIsNone(config.keep_checkpoint_max) self.assertIsNone(config.keep_checkpoint_every_n_hours) + self.assertIsNone(config.device_fn) def test_replace_with_disallowallowed_properties(self): config = run_config_lib.RunConfig() @@ -166,9 +173,12 @@ class RunConfigTest(test.TestCase): config.replace(keep_checkpoint_every_n_hours=0) with self.assertRaisesRegexp(ValueError, _TF_RANDOM_SEED_ERR): config.replace(tf_random_seed=1.0) + with self.assertRaisesRegexp(ValueError, _DEVICE_FN_ERR): + config.replace(device_fn=lambda x, y: 0) def test_init_with_allowed_properties(self): session_config = config_pb2.ConfigProto(allow_soft_placement=True) + device_fn = lambda op: "/cpu:0" config = run_config_lib.RunConfig( tf_random_seed=11, @@ -176,13 +186,15 @@ class RunConfigTest(test.TestCase): save_checkpoints_secs=14, session_config=session_config, keep_checkpoint_max=16, - keep_checkpoint_every_n_hours=17) + keep_checkpoint_every_n_hours=17, + device_fn=device_fn) self.assertEqual(11, config.tf_random_seed) self.assertEqual(12, config.save_summary_steps) 
self.assertEqual(14, config.save_checkpoints_secs) self.assertEqual(session_config, config.session_config) self.assertEqual(16, config.keep_checkpoint_max) self.assertEqual(17, config.keep_checkpoint_every_n_hours) + self.assertEqual(device_fn, config.device_fn) def test_init_none_value(self): config = run_config_lib.RunConfig( @@ -193,7 +205,8 @@ class RunConfigTest(test.TestCase): save_checkpoints_steps=None, session_config=None, keep_checkpoint_max=None, - keep_checkpoint_every_n_hours=None) + keep_checkpoint_every_n_hours=None, + device_fn=None) self.assertIsNone(config.tf_random_seed) self.assertIsNone(config.model_dir) self.assertIsNone(config.save_summary_steps) @@ -202,6 +215,7 @@ class RunConfigTest(test.TestCase): self.assertIsNone(config.session_config) self.assertIsNone(config.keep_checkpoint_max) self.assertIsNone(config.keep_checkpoint_every_n_hours) + self.assertIsNone(config.device_fn) def test_init_invalid_values(self): with self.assertRaisesRegexp(ValueError, _MODEL_DIR_ERR): @@ -220,6 +234,8 @@ class RunConfigTest(test.TestCase): run_config_lib.RunConfig(keep_checkpoint_every_n_hours=0) with self.assertRaisesRegexp(ValueError, _TF_RANDOM_SEED_ERR): run_config_lib.RunConfig(tf_random_seed=1.0) + with self.assertRaisesRegexp(ValueError, _DEVICE_FN_ERR): + run_config_lib.RunConfig(device_fn=lambda x: "/cpu:0") class RunConfigDistributedSettingTest(test.TestCase): diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py index a7c4eabcb26..c16c3cda489 100644 --- a/tensorflow/python/feature_column/feature_column.py +++ b/tensorflow/python/feature_column/feature_column.py @@ -162,7 +162,6 @@ from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import checkpoint_utils from tensorflow.python.util import nest from tensorflow.python.util.tf_export import tf_export -from tensorflow.python.util.tf_export import tf_export def _internal_input_layer(features, diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py index 807582bd7e5..7f9ef53457a 100644 --- a/tensorflow/python/framework/dtypes.py +++ b/tensorflow/python/framework/dtypes.py @@ -700,11 +700,13 @@ def as_dtype(type_value): if type_value.type == np.string_ or type_value.type == np.unicode_: return string - for key, val in _NP_TO_TF: - try: - if key == type_value: - return val - except TypeError as e: - raise TypeError("Cannot convert {} to a dtype. {}".format(type_value, e)) + if isinstance(type_value, (type, np.dtype)): + for key, val in _NP_TO_TF: + try: + if key == type_value: + return val + except TypeError as e: + raise TypeError("Cannot convert {} to a dtype. {}".format( + type_value, e)) raise TypeError("Cannot convert value %r to a TensorFlow DType." % type_value) diff --git a/tensorflow/python/framework/graph_util_impl.py b/tensorflow/python/framework/graph_util_impl.py index 910364364c8..394fac6c856 100644 --- a/tensorflow/python/framework/graph_util_impl.py +++ b/tensorflow/python/framework/graph_util_impl.py @@ -285,7 +285,7 @@ def convert_variables_to_constants(sess, output_graph_def.node.extend([output_node]) output_graph_def.library.CopyFrom(inference_graph.library) - print("Converted %d variables to const ops." 
% how_many_converted) + logging.info("Converted %d variables to const ops.", how_many_converted) return output_graph_def diff --git a/tensorflow/python/framework/graph_util_test.py b/tensorflow/python/framework/graph_util_test.py index b618152b025..2dafb94ba7e 100644 --- a/tensorflow/python/framework/graph_util_test.py +++ b/tensorflow/python/framework/graph_util_test.py @@ -209,7 +209,7 @@ class DeviceFunctionsTest(test.TestCase): defun_node, 2.0, name="output_node") with session.Session() as sess: - init = variables.initialize_variables([variable_node]) + init = variables.variables_initializer([variable_node]) sess.run(init) output = sess.run(output_node) self.assertNear(4.0, output, 0.00001) diff --git a/tensorflow/python/framework/load_library.py b/tensorflow/python/framework/load_library.py index 535c6017f5f..9a8477debb0 100644 --- a/tensorflow/python/framework/load_library.py +++ b/tensorflow/python/framework/load_library.py @@ -58,7 +58,7 @@ def load_op_library(library_filename): op_list_str = py_tf.TF_GetOpList(lib_handle) op_list = op_def_pb2.OpList() op_list.ParseFromString(compat.as_bytes(op_list_str)) - wrappers = py_tf.GetPythonWrappers(op_list_str) + wrappers = py_tf.GetEagerPythonWrappers(op_list_str) # Delete the library handle to release any memory held in C # that are no longer needed. diff --git a/tensorflow/python/framework/python_op_gen.i b/tensorflow/python/framework/python_op_gen.i index 26ec4e8e66b..efcce2f2094 100644 --- a/tensorflow/python/framework/python_op_gen.i +++ b/tensorflow/python/framework/python_op_gen.i @@ -16,10 +16,10 @@ limitations under the License. %include "tensorflow/python/platform/base.i" %{ -#include "tensorflow/python/framework/python_op_gen.h" +#include "tensorflow/python/eager/python_eager_op_gen.h" %} -// Input typemap for GetPythonWrappers. +// Input typemap for GetEagerPythonWrappers. // Accepts a python object of 'bytes' type, and converts it to // a const char* pointer and size_t length. The default typemap // going from python bytes to const char* tries to decode the @@ -37,5 +37,5 @@ limitations under the License. 
%ignoreall; -%unignore tensorflow::GetPythonWrappers; -%include "tensorflow/python/framework/python_op_gen.h" +%unignore tensorflow::GetEagerPythonWrappers; +%include "tensorflow/python/eager/python_eager_op_gen.h" diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index f954b9d6c73..5a8bc437273 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -1014,6 +1014,8 @@ class TensorFlowTestCase(googletest.TestCase): config.graph_options.optimizer_options.opt_level = -1 config.graph_options.rewrite_options.constant_folding = ( rewriter_config_pb2.RewriterConfig.OFF) + config.graph_options.rewrite_options.arithmetic_optimization = ( + rewriter_config_pb2.RewriterConfig.OFF) return config if graph is None: diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py index 5a84b16a23f..e3dd4b0bdfb 100644 --- a/tensorflow/python/grappler/layout_optimizer_test.py +++ b/tensorflow/python/grappler/layout_optimizer_test.py @@ -476,7 +476,7 @@ class LayoutOptimizerTest(test.TestCase): random_seed.set_random_seed(0) x = random_ops.truncated_normal([1, 784], seed=0) conv = _two_layer_model(x) - reduce_sum = math_ops.reduce_sum(conv, axis=[1, 2], keep_dims=True) + reduce_sum = math_ops.reduce_sum(conv, axis=[1, 2], keepdims=True) squeeze = array_ops.squeeze(reduce_sum, axis=[1, 2]) output = array_ops.identity(squeeze) @@ -506,7 +506,7 @@ class LayoutOptimizerTest(test.TestCase): random_seed.set_random_seed(0) x = random_ops.truncated_normal([1, 784], seed=0) conv = _two_layer_model(x) - reduce_sum = math_ops.reduce_sum(conv, axis=[0, 1, 2], keep_dims=True) + reduce_sum = math_ops.reduce_sum(conv, axis=[0, 1, 2], keepdims=True) squeeze = array_ops.squeeze(reduce_sum, axis=[0, 1, 2]) output = array_ops.identity(squeeze) @@ -623,7 +623,7 @@ class LayoutOptimizerTest(test.TestCase): random_seed.set_random_seed(0) x = random_ops.truncated_normal([1, 784], seed=0) conv = _two_layer_model(x) - reduce_sum = math_ops.reduce_sum(conv, axis=[3], keep_dims=True) + reduce_sum = math_ops.reduce_sum(conv, axis=[3], keepdims=True) output = array_ops.identity(reduce_sum) with session.Session(config=_get_config(False)) as sess: @@ -653,7 +653,7 @@ class LayoutOptimizerTest(test.TestCase): random_seed.set_random_seed(0) x = random_ops.truncated_normal([1, 784], seed=0) conv = _two_layer_model(x) - reduce_sum = math_ops.reduce_sum(conv, axis=[2], keep_dims=True) + reduce_sum = math_ops.reduce_sum(conv, axis=[2], keepdims=True) output = array_ops.identity(reduce_sum) with session.Session(config=_get_config(False)) as sess: @@ -682,7 +682,7 @@ class LayoutOptimizerTest(test.TestCase): random_seed.set_random_seed(0) x = random_ops.truncated_normal([1, 784], seed=0) conv = _two_layer_model(x) - reduce_sum = math_ops.reduce_sum(conv, axis=[2, 3], keep_dims=True) + reduce_sum = math_ops.reduce_sum(conv, axis=[2, 3], keepdims=True) output = array_ops.identity(reduce_sum) with session.Session(config=_get_config(False)) as sess: diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py index 81a4d2f820a..449410fe082 100644 --- a/tensorflow/python/keras/_impl/keras/backend.py +++ b/tensorflow/python/keras/_impl/keras/backend.py @@ -3448,7 +3448,7 @@ def categorical_crossentropy(target, output, from_logits=False): Returns: Output tensor. 
""" - # Note: nn.softmax_cross_entropy_with_logits + # Note: nn.softmax_cross_entropy_with_logits_v2 # expects logits, Keras expects probabilities. if not from_logits: # scale preds so that the class probas of each sample sum to 1 @@ -3512,7 +3512,7 @@ def binary_crossentropy(target, output, from_logits=False): Returns: A tensor. """ - # Note: nn.softmax_cross_entropy_with_logits + # Note: nn.sigmoid_cross_entropy_with_logits # expects logits, Keras expects probabilities. if not from_logits: # transform back to logits diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization.py b/tensorflow/python/keras/_impl/keras/layers/normalization.py index 5462a95d7d0..c16fc07fb4e 100644 --- a/tensorflow/python/keras/_impl/keras/layers/normalization.py +++ b/tensorflow/python/keras/_impl/keras/layers/normalization.py @@ -593,9 +593,9 @@ class BatchNormalization(Layer): # used during evaluation, it is more efficient to just update in one # step and should not make a significant difference in the result. new_mean = math_ops.reduce_mean(new_mean, - axis=1, keep_dims=True) + axis=1, keepdims=True) new_variance = math_ops.reduce_mean(new_variance, - axis=1, keep_dims=True) + axis=1, keepdims=True) def _do_update(var, value): if in_eager_mode and not self.trainable: diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index ebbec39cf3a..c03c5146994 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -917,6 +917,20 @@ tf_py_test( ], ) +tf_py_test( + name = "string_strip_op_test", + size = "small", + srcs = ["string_strip_op_test.py"], + additional_deps = [ + "//third_party/py/numpy", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:errors", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:string_ops", + ], +) + tf_py_test( name = "substr_op_test", size = "small", @@ -1195,6 +1209,18 @@ cuda_py_test( ], ) +cuda_py_test( + name = "broadcast_to_ops_test", + size = "small", + srcs = ["broadcast_to_ops_test.py"], + additional_deps = [ + "//third_party/py/numpy", + "//tensorflow/python:array_ops", + "//tensorflow/python:client", + "//tensorflow/python:client_testlib", + ], +) + cuda_py_test( name = "inplace_ops_test", size = "small", diff --git a/tensorflow/python/kernel_tests/broadcast_to_ops_test.py b/tensorflow/python/kernel_tests/broadcast_to_ops_test.py new file mode 100644 index 00000000000..6a1bd958ba8 --- /dev/null +++ b/tensorflow/python/kernel_tests/broadcast_to_ops_test.py @@ -0,0 +1,85 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for broadcast_to ops.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import test as test_lib + + +class BroadcastToTest(test_util.TensorFlowTestCase): + + def testBroadcastToBasic(self): + for dtype in [np.uint8, np.uint16, np.int8, np.int16, np.int32, np.int64]: + with self.test_session(use_gpu=True): + x = np.array([1, 2, 3], dtype=dtype) + v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3]) + v_np = np.broadcast_to(x, [3, 3]) + self.assertAllEqual(v_tf.eval(), v_np) + + def testBroadcastToString(self): + with self.test_session(use_gpu=True): + x = np.array([b"1", b"2", b"3"]) + v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3]) + v_np = np.broadcast_to(x, [3, 3]) + self.assertAllEqual(v_tf.eval(), v_np) + + def testBroadcastToBool(self): + with self.test_session(use_gpu=True): + x = np.array([True, False, True], dtype=np.bool) + v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3]) + v_np = np.broadcast_to(x, [3, 3]) + self.assertAllEqual(v_tf.eval(), v_np) + + def testBroadcastToShape(self): + for input_dim in range(1, 6): + for output_dim in range(input_dim, 6): + with self.test_session(use_gpu=True): + input_shape = [2] * input_dim + output_shape = [2] * output_dim + x = np.array(np.random.randint(5, size=input_shape), dtype=np.int32) + v_tf = array_ops.broadcast_to(constant_op.constant(x), output_shape) + v_np = np.broadcast_to(x, output_shape) + self.assertAllEqual(v_tf.eval(), v_np) + + def testBroadcastToScalar(self): + with self.test_session(use_gpu=True): + x = np.array(1, dtype=np.int32) + v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3]) + v_np = np.broadcast_to(x, [3, 3]) + self.assertAllEqual(v_tf.eval(), v_np) + + def testBroadcastToShapeTypeAndInference(self): + for dtype in [dtypes.int32, dtypes.int64]: + with self.test_session(use_gpu=True): + x = np.array([1, 2, 3]) + v_tf = array_ops.broadcast_to( + constant_op.constant(x), + constant_op.constant([3, 3], dtype=dtype)) + shape = v_tf.get_shape().as_list() + v_np = np.broadcast_to(x, [3, 3]) + self.assertAllEqual(v_tf.eval(), v_np) + # check shape inference when shape input is constant + self.assertAllEqual(shape, v_np.shape) + +if __name__ == "__main__": + test_lib.main() diff --git a/tensorflow/python/kernel_tests/confusion_matrix_test.py b/tensorflow/python/kernel_tests/confusion_matrix_test.py index 670a625f0f1..79e419867d7 100644 --- a/tensorflow/python/kernel_tests/confusion_matrix_test.py +++ b/tensorflow/python/kernel_tests/confusion_matrix_test.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -104,11 +105,7 @@ class ConfusionMatrixTest(test.TestCase): d, l, cm_out = sess.run([data, lab, cm], {m_neg: 0.0, m_pos: 1.0, s: 1.0}) truth = np.zeros([2, 2], dtype=np_dtype) - try: - range_builder = xrange - except NameError: # In Python 3. 
- range_builder = range
- for i in range_builder(len(d)):
+ for i in xrange(len(d)):
truth[l[i], d[i]] += 1
self.assertEqual(cm_out.dtype, np_dtype)
diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py
index 749313b00d8..107ee37fabb 100644
--- a/tensorflow/python/kernel_tests/constant_op_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_test.py
@@ -65,6 +65,11 @@ class ConstantTest(test.TestCase):
self._testCpu(x)
self._testGpu(x)
+ def testInvalidDType(self):
+ # Test case for GitHub issue 18474
+ with self.assertRaises(TypeError):
+ constant_op.constant(dtypes_lib.string, "[,]")
+
def testBFloat16(self):
bfloat16 = dtypes_lib.bfloat16.as_numpy_dtype
self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(bfloat16))
diff --git a/tensorflow/python/kernel_tests/conv3d_transpose_test.py b/tensorflow/python/kernel_tests/conv3d_transpose_test.py
index a8b3af50962..8973a450fa2 100644
--- a/tensorflow/python/kernel_tests/conv3d_transpose_test.py
+++ b/tensorflow/python/kernel_tests/conv3d_transpose_test.py
@@ -119,6 +119,18 @@ class Conv3DTransposeTest(test.TestCase):
target = 3.0
self.assertAllClose(target, value[n, d, h, w, k])
+ def testConv3DTransposeShapeMismatch(self):
+ # Test case for GitHub issue 18460
+ x_shape = [2, 2, 3, 4, 3]
+ f_shape = [3, 3, 3, 2, 2]
+ y_shape = [2, 2, 6, 8, 6]
+ strides = [1, 1, 2, 2, 2]
+ np.random.seed(1)
+ x_value = np.random.random_sample(x_shape).astype(np.float64)
+ f_value = np.random.random_sample(f_shape).astype(np.float64)
+ nn_ops.conv3d_transpose(
+ x_value, f_value, y_shape, strides, data_format='NCDHW')
+
def testConv3DTransposeValid(self):
with self.test_session():
strides = [1, 2, 2, 2, 1]
diff --git a/tensorflow/python/kernel_tests/manip_ops_test.py b/tensorflow/python/kernel_tests/manip_ops_test.py
index b8200ac0cb1..f31426713c4 100644
--- a/tensorflow/python/kernel_tests/manip_ops_test.py
+++ b/tensorflow/python/kernel_tests/manip_ops_test.py
@@ -20,8 +20,10 @@ from __future__ import print_function
import numpy as np
from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
from tensorflow.python.framework import errors_impl
from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gradient_checker
from tensorflow.python.ops import manip_ops
from tensorflow.python.platform import test as test_lib
@@ -88,41 +90,78 @@ class RollTest(test_util.TensorFlowTestCase):
x = np.random.rand(3, 2, 1, 1).astype(t)
self._testAll(x + 1j * x, [2, 1, 1, 0], [0, 3, 1, 2])
+ def testNegativeAxis(self):
+ self._testAll(np.random.randint(-100, 100, (5)).astype(np.int32), 3, -1)
+ self._testAll(np.random.randint(-100, 100, (4, 4)).astype(np.int32), 3, -2)
+ # Make sure a negative axis satisfies 0 <= axis + dims < dims
+ with self.test_session():
+ with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+ "is out of range"):
+ manip_ops.roll(np.random.randint(-100, 100, (4, 4)).astype(np.int32),
+ 3, -10).eval()
+
+ def testInvalidInputShape(self):
+ # The input should be 1-D or higher, checked in shape function.
+ with self.assertRaisesRegexp(
+ ValueError, "Shape must be at least rank 1 but is rank 0"):
+ manip_ops.roll(7, 1, 0)
+
def testRollInputMustVectorHigherRaises(self):
- tensor = 7
+ # The input should be 1-D or higher, checked in kernel.
+ tensor = array_ops.placeholder(dtype=dtypes.int32) shift = 1 axis = 0 with self.test_session(): with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, "input must be 1-D or higher"): - manip_ops.roll(tensor, shift, axis).eval() + manip_ops.roll(tensor, shift, axis).eval(feed_dict={tensor: 7}) + + def testInvalidAxisShape(self): + # The axis should be a scalar or 1-D, checked in shape function. + with self.assertRaisesRegexp( + ValueError, "Shape must be at most rank 1 but is rank 2"): + manip_ops.roll([[1, 2], [3, 4]], 1, [[0, 1]]) def testRollAxisMustBeScalarOrVectorRaises(self): + # The axis should be a scalar or 1-D, checked in kernel. tensor = [[1, 2], [3, 4]] shift = 1 - axis = [[0, 1]] + axis = array_ops.placeholder(dtype=dtypes.int32) with self.test_session(): with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, "axis must be a scalar or a 1-D vector"): - manip_ops.roll(tensor, shift, axis).eval() + manip_ops.roll(tensor, shift, axis).eval(feed_dict={axis: [[0, 1]]}) + + def testInvalidShiftShape(self): + # The shift should be a scalar or 1-D, checked in shape function. + with self.assertRaisesRegexp( + ValueError, "Shape must be at most rank 1 but is rank 2"): + manip_ops.roll([[1, 2], [3, 4]], [[0, 1]], 1) def testRollShiftMustBeScalarOrVectorRaises(self): + # The shift should be a scalar or 1-D, checked in kernel. tensor = [[1, 2], [3, 4]] - shift = [[0, 1]] + shift = array_ops.placeholder(dtype=dtypes.int32) axis = 1 with self.test_session(): with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, "shift must be a scalar or a 1-D vector"): - manip_ops.roll(tensor, shift, axis).eval() + manip_ops.roll(tensor, shift, axis).eval(feed_dict={shift: [[0, 1]]}) + + def testInvalidShiftAndAxisNotEqualShape(self): + # The shift and axis must be same size, checked in shape function. + with self.assertRaisesRegexp(ValueError, "both shapes must be equal"): + manip_ops.roll([[1, 2], [3, 4]], [1], [0, 1]) def testRollShiftAndAxisMustBeSameSizeRaises(self): + # The shift and axis must be same size, checked in kernel. 
tensor = [[1, 2], [3, 4]] - shift = [1] + shift = array_ops.placeholder(dtype=dtypes.int32) axis = [0, 1] with self.test_session(): with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, "shift and axis must have the same size"): - manip_ops.roll(tensor, shift, axis).eval() + manip_ops.roll(tensor, shift, axis).eval(feed_dict={shift: [1]}) def testRollAxisOutOfRangeRaises(self): tensor = [1, 2] diff --git a/tensorflow/python/kernel_tests/norm_op_test.py b/tensorflow/python/kernel_tests/norm_op_test.py index d85512fae69..3f71b326a2f 100644 --- a/tensorflow/python/kernel_tests/norm_op_test.py +++ b/tensorflow/python/kernel_tests/norm_op_test.py @@ -37,17 +37,17 @@ class NormOpTest(test_lib.TestCase): def testBadOrder(self): matrix = [[0., 1.], [2., 3.]] - for ord_ in "foo", -7, -1.1, 0: + for ord_ in "fro", -7, -1.1, 0: with self.assertRaisesRegexp(ValueError, "'ord' must be a supported vector norm"): - linalg_ops.norm(matrix, ord="fro") + linalg_ops.norm(matrix, ord=ord_) - for ord_ in "foo", -7, -1.1, 0: + for ord_ in "fro", -7, -1.1, 0: with self.assertRaisesRegexp(ValueError, "'ord' must be a supported vector norm"): linalg_ops.norm(matrix, ord=ord_, axis=-1) - for ord_ in 1.1, 2: + for ord_ in "foo", -7, -1.1, 1.1: with self.assertRaisesRegexp(ValueError, "'ord' must be a supported matrix norm"): linalg_ops.norm(matrix, ord=ord_, axis=[-2, -1]) @@ -69,14 +69,14 @@ def _GetNormOpTest(dtype_, shape_, ord_, axis_, keep_dims_, use_static_shape_): if use_static_shape_: tf_matrix = constant_op.constant(matrix) tf_norm = linalg_ops.norm( - tf_matrix, ord=ord_, axis=axis_, keep_dims=keep_dims_) + tf_matrix, ord=ord_, axis=axis_, keepdims=keep_dims_) tf_norm_val = sess.run(tf_norm) else: tf_matrix = array_ops.placeholder(dtype_) tf_norm = linalg_ops.norm( - tf_matrix, ord=ord_, axis=axis_, keep_dims=keep_dims_) + tf_matrix, ord=ord_, axis=axis_, keepdims=keep_dims_) tf_norm_val = sess.run(tf_norm, feed_dict={tf_matrix: matrix}) - self.assertAllClose(np_norm, tf_norm_val) + self.assertAllClose(np_norm, tf_norm_val, rtol=1e-5, atol=1e-5) def Test(self): is_matrix_norm = (isinstance(axis_, tuple) or @@ -85,8 +85,6 @@ def _GetNormOpTest(dtype_, shape_, ord_, axis_, keep_dims_, use_static_shape_): if ((not is_matrix_norm and ord_ == "fro") or (is_matrix_norm and is_fancy_p_norm)): self.skipTest("Not supported by neither numpy.linalg.norm nor tf.norm") - if is_matrix_norm and ord_ == 2: - self.skipTest("Not supported by tf.norm") if ord_ == 'euclidean' or (axis_ is None and len(shape) > 2): self.skipTest("Not supported by numpy.linalg.norm") matrix = np.random.randn(*shape_).astype(dtype_) diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py index 5b508b7c0e7..b9f44d728a1 100644 --- a/tensorflow/python/kernel_tests/py_func_test.py +++ b/tensorflow/python/kernel_tests/py_func_test.py @@ -52,6 +52,38 @@ class PyFuncTest(test.TestCase): """Encapsulates tests for py_func and eager_py_func.""" # ----- Tests for py_func ----- + def testRealDataTypes(self): + def sum_func(x, y): + return x + y + for dtype in [dtypes.float16, dtypes.float32, dtypes.float64, + dtypes.uint8, dtypes.int8, dtypes.uint16, dtypes.int16, + dtypes.int32, dtypes.int64]: + with self.test_session(): + x = constant_op.constant(1, dtype=dtype) + y = constant_op.constant(2, dtype=dtype) + z = self.evaluate(script_ops.py_func(sum_func, [x, y], dtype)) + self.assertEqual(z, 3) + + def testComplexDataTypes(self): + def sub_func(x, y): + return x - y + for dtype in 
[dtypes.complex64, dtypes.complex128]: + with self.test_session(): + x = constant_op.constant(1 + 1j, dtype=dtype) + y = constant_op.constant(2 - 2j, dtype=dtype) + z = self.evaluate(script_ops.py_func(sub_func, [x, y], dtype)) + self.assertEqual(z, -1 + 3j) + + def testBoolDataTypes(self): + def and_func(x, y): + return x and y + dtype = dtypes.bool + with self.test_session(): + x = constant_op.constant(True, dtype=dtype) + y = constant_op.constant(False, dtype=dtype) + z = self.evaluate(script_ops.py_func(and_func, [x, y], dtype)) + self.assertEqual(z, False) + def testSingleType(self): with self.test_session(): x = constant_op.constant(1.0, dtypes.float32) diff --git a/tensorflow/python/kernel_tests/random/multinomial_op_test.py b/tensorflow/python/kernel_tests/random/multinomial_op_test.py index a9dc7b7de00..051c7d86bf2 100644 --- a/tensorflow/python/kernel_tests/random/multinomial_op_test.py +++ b/tensorflow/python/kernel_tests/random/multinomial_op_test.py @@ -46,7 +46,7 @@ def composed_sampler(logits, num_samples): logits = array_ops.expand_dims(logits, -1) # [batch size, num samples] - return math_ops.argmax(logits + noise, dimension=1) + return math_ops.argmax(logits + noise, axis=1) native_sampler = random_ops.multinomial diff --git a/tensorflow/python/kernel_tests/random/random_ops_test.py b/tensorflow/python/kernel_tests/random/random_ops_test.py index df37dd98ece..e4b5c3832a2 100644 --- a/tensorflow/python/kernel_tests/random/random_ops_test.py +++ b/tensorflow/python/kernel_tests/random/random_ops_test.py @@ -228,6 +228,17 @@ class RandomUniformTest(test.TestCase): print("count = ", count) self.assertTrue(count < count_limit) + def testUniformIntsWithInvalidShape(self): + for dtype in dtypes.int32, dtypes.int64: + with self.assertRaisesRegexp( + ValueError, "Shape must be rank 0 but is rank 1"): + random_ops.random_uniform( + [1000], minval=[1, 2], maxval=3, dtype=dtype) + with self.assertRaisesRegexp( + ValueError, "Shape must be rank 0 but is rank 1"): + random_ops.random_uniform( + [1000], minval=1, maxval=[2, 3], dtype=dtype) + # Check that uniform ints actually follow a uniform distribution. def testUniformInts(self): minv = -2 diff --git a/tensorflow/python/kernel_tests/string_strip_op_test.py b/tensorflow/python/kernel_tests/string_strip_op_test.py new file mode 100644 index 00000000000..30fd477ff42 --- /dev/null +++ b/tensorflow/python/kernel_tests/string_strip_op_test.py @@ -0,0 +1,56 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for string_strip_op.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.ops import string_ops +from tensorflow.python.platform import test + + +class StringStripOpTest(test.TestCase): + """ Test cases for tf.string_strip.""" + + def test_string_strip(self): + strings = ["pigs on the wing", "animals"] + + with self.test_session() as sess: + output = string_ops.string_strip(strings) + output = sess.run(output) + self.assertAllEqual(output, [b"pigs on the wing", b"animals"]) + + def test_string_strip_2d(self): + strings = [["pigs on the wing", "animals"], + [" hello ", "\n\tworld \r \n"]] + + with self.test_session() as sess: + output = string_ops.string_strip(strings) + output = sess.run(output) + self.assertAllEqual(output, [[b"pigs on the wing", b"animals"], + [b"hello", b"world"]]) + + def test_string_strip_with_empty_strings(self): + strings = [" hello ", "", "world ", " \t \r \n "] + + with self.test_session() as sess: + output = string_ops.string_strip(strings) + output = sess.run(output) + self.assertAllEqual(output, [b"hello", b"", b"world", b""]) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc index 22317a348c9..8c6bb7955a4 100644 --- a/tensorflow/python/lib/core/py_func.cc +++ b/tensorflow/python/lib/core/py_func.cc @@ -126,6 +126,9 @@ Status NumericNpDTypeToTfDType(const int np, DataType* tf) { case NPY_INT8: *tf = DT_INT8; break; + case NPY_UINT16: + *tf = DT_UINT16; + break; case NPY_INT16: *tf = DT_INT16; break; diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index fa26e07c853..ceeabe090df 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -144,6 +144,7 @@ def identity(input, name=None): # pylint: disable=redefined-builtin # pylint: disable=redefined-builtin,protected-access @tf_export("expand_dims") +@deprecation.deprecated_args(None, "Use the `axis` argument instead", "dim") def expand_dims(input, axis=None, name=None, dim=None): """Inserts a dimension of 1 into a tensor's shape. @@ -193,11 +194,7 @@ def expand_dims(input, axis=None, name=None, dim=None): Raises: ValueError: if both `dim` and `axis` are specified. """ - # TODO(aselle): Remove argument dim - if dim is not None: - if axis is not None: - raise ValueError("can't specify both 'dim' and 'axis'") - axis = dim + axis = deprecation.deprecated_argument_lookup("axis", axis, "dim", dim) return gen_array_ops.expand_dims(input, axis, name) @@ -2581,6 +2578,8 @@ def sequence_mask(lengths, maxlen=None, dtype=dtypes.bool, name=None): @tf_export("squeeze") +@deprecation.deprecated_args(None, "Use the `axis` argument instead", + "squeeze_dims") def squeeze(input, axis=None, name=None, squeeze_dims=None): # pylint: disable=redefined-builtin """Removes dimensions of size 1 from the shape of a tensor. @@ -2621,10 +2620,8 @@ def squeeze(input, axis=None, name=None, squeeze_dims=None): Raises: ValueError: When both `squeeze_dims` and `axis` are specified. 
""" - if squeeze_dims is not None: - if axis is not None: - raise ValueError("Cannot specify both 'squeeze_dims' and 'axis'") - axis = squeeze_dims + axis = deprecation.deprecated_argument_lookup( + "axis", axis, "squeeze_dims", squeeze_dims) if np.isscalar(axis): axis = [axis] return gen_array_ops.squeeze(input, axis, name) diff --git a/tensorflow/python/ops/distributions/categorical.py b/tensorflow/python/ops/distributions/categorical.py index 66fa9e110c1..8f25b1149c3 100644 --- a/tensorflow/python/ops/distributions/categorical.py +++ b/tensorflow/python/ops/distributions/categorical.py @@ -311,7 +311,7 @@ class Categorical(distribution.Distribution): nn_ops.log_softmax(self.logits) * self.probs, axis=-1) def _mode(self): - ret = math_ops.argmax(self.logits, dimension=self._batch_rank) + ret = math_ops.argmax(self.logits, axis=self._batch_rank) ret = math_ops.cast(ret, self.dtype) ret.set_shape(self.batch_shape) return ret diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py index f0120f2957d..9e46739bc1b 100644 --- a/tensorflow/python/ops/embedding_ops.py +++ b/tensorflow/python/ops/embedding_ops.py @@ -331,11 +331,11 @@ def embedding_lookup_sparse(params, representing sharded embedding tensors. Alternatively, a `PartitionedVariable`, created by partitioning along dimension 0. Each element must be appropriately sized for the given `partition_strategy`. - sp_ids: N x M SparseTensor of int64 ids (typically from FeatureValueToId), + sp_ids: N x M `SparseTensor` of int64 ids (typically from FeatureValueToId), where N is typically batch size and M is arbitrary. - sp_weights: either a SparseTensor of float / double weights, or None to - indicate all weights should be taken to be 1. If specified, sp_weights - must have exactly the same shape and indices as sp_ids. + sp_weights: either a `SparseTensor` of float / double weights, or `None` to + indicate all weights should be taken to be 1. If specified, `sp_weights` + must have exactly the same shape and indices as `sp_ids`. partition_strategy: A string specifying the partitioning strategy, relevant if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default is `"mod"`. See `tf.nn.embedding_lookup` for more details. @@ -351,39 +351,43 @@ def embedding_lookup_sparse(params, Returns: A dense tensor representing the combined embeddings for the - sparse ids. For each row in the dense tensor represented by sp_ids, the op + sparse ids. For each row in the dense tensor represented by `sp_ids`, the op looks up the embeddings for all ids in that row, multiplies them by the corresponding weight, and combines these embeddings as specified. In other words, if - shape(combined params) = [p0, p1, ..., pm] + `shape(combined params) = [p0, p1, ..., pm]` and - shape(sp_ids) = shape(sp_weights) = [d0, d1, ..., dn] + `shape(sp_ids) = shape(sp_weights) = [d0, d1, ..., dn]` then - shape(output) = [d0, d1, ..., dn-1, p1, ..., pm]. + `shape(output) = [d0, d1, ..., dn-1, p1, ..., pm]`. For instance, if params is a 10x20 matrix, and sp_ids / sp_weights are + ```python [0, 0]: id 1, weight 2.0 [0, 1]: id 3, weight 0.5 [1, 0]: id 0, weight 1.0 [2, 3]: id 1, weight 3.0 + ``` with `combiner`="mean", then the output will be a 3x20 matrix where + ```python output[0, :] = (params[1, :] * 2.0 + params[3, :] * 0.5) / (2.0 + 0.5) output[1, :] = (params[0, :] * 1.0) / 1.0 output[2, :] = (params[1, :] * 3.0) / 3.0 + ``` Raises: - TypeError: If sp_ids is not a SparseTensor, or if sp_weights is neither - None nor SparseTensor. 
- ValueError: If combiner is not one of {"mean", "sqrtn", "sum"}. + TypeError: If `sp_ids` is not a `SparseTensor`, or if `sp_weights` is + neither `None` nor `SparseTensor`. + ValueError: If `combiner` is not one of {"mean", "sqrtn", "sum"}. """ if combiner is None: logging.warn("The default value of combiner will change from \"mean\" " diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py index 4a1ef54fb50..ec38d89a0ec 100644 --- a/tensorflow/python/ops/histogram_ops.py +++ b/tensorflow/python/ops/histogram_ops.py @@ -32,7 +32,6 @@ from tensorflow.python.ops import clip_ops from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import math_ops from tensorflow.python.util.tf_export import tf_export -from tensorflow.python.util.tf_export import tf_export @tf_export('histogram_fixed_width_bins') diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index 3369fe3c9b3..601010bce9e 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -269,17 +269,7 @@ def random_flip_up_down(image, seed=None): Raises: ValueError: if the shape of `image` not supported. """ - with ops.name_scope(None, 'random_flip_up_down', [image]) as scope: - image = ops.convert_to_tensor(image, name='image') - image = _Assert3DImage(image) - uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed) - mirror_cond = math_ops.less(uniform_random, .5) - result = control_flow_ops.cond( - mirror_cond, - lambda: array_ops.reverse(image, [0]), - lambda: image, - name=scope) - return fix_image_flip_shape(image, result) + return _random_flip(image, 0, seed, 'random_flip_up_down') @tf_export('image.random_flip_left_right') @@ -301,14 +291,34 @@ def random_flip_left_right(image, seed=None): Raises: ValueError: if the shape of `image` not supported. """ - with ops.name_scope(None, 'random_flip_left_right', [image]) as scope: + return _random_flip(image, 1, seed, 'random_flip_left_right') + + +def _random_flip(image, flip_index, seed, scope_name): + """Randomly (50% chance) flip an image along axis `flip_index`. + Args: + image: A 3-D tensor of shape `[height, width, channels].` + flip_index: The dimension along which to flip the image. + Vertical: 0, Horizontal: 1 + seed: A Python integer. Used to create a random seed. See + @{tf.set_random_seed} + for behavior. + scope_name: Name of the scope in which the ops are added. + + Returns: + A 3-D tensor of the same type and shape as `image`. + + Raises: + ValueError: if the shape of `image` not supported. + """ + with ops.name_scope(None, scope_name, [image]) as scope: image = ops.convert_to_tensor(image, name='image') image = _Assert3DImage(image) uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed) mirror_cond = math_ops.less(uniform_random, .5) result = control_flow_ops.cond( mirror_cond, - lambda: array_ops.reverse(image, [1]), + lambda: array_ops.reverse(image, [flip_index]), lambda: image, name=scope) return fix_image_flip_shape(image, result) @@ -332,16 +342,7 @@ def flip_left_right(image): Raises: ValueError: if the shape of `image` not supported. 
""" - with ops.name_scope(None, 'flip_left_right', [image]): - image = ops.convert_to_tensor(image, name='image') - image = _AssertAtLeast3DImage(image) - shape = image.get_shape() - if shape.ndims == 3 or shape.ndims is None: - return fix_image_flip_shape(image, array_ops.reverse(image, [1])) - elif shape.ndims == 4: - return array_ops.reverse(image, [2]) - else: - raise ValueError('\'image\' must have either 3 or 4 dimensions.') + return _flip(image, 1, 'flip_left_right') @tf_export('image.flip_up_down') @@ -362,14 +363,35 @@ def flip_up_down(image): Raises: ValueError: if the shape of `image` not supported. """ - with ops.name_scope(None, 'flip_up_down', [image]): + return _flip(image, 0, 'flip_up_down') + + +def _flip(image, flip_index, scope_name): + """Flip an image either horizontally or vertically. + + Outputs the contents of `image` flipped along the dimension `flip_index`. + + See also `reverse()`. + + Args: + image: 4-D Tensor of shape `[batch, height, width, channels]` or + 3-D Tensor of shape `[height, width, channels]`. + flip_index: 0 For vertical, 1 for horizontal. + + Returns: + A tensor of the same type and shape as `image`. + + Raises: + ValueError: if the shape of `image` not supported. + """ + with ops.name_scope(None, scope_name, [image]): image = ops.convert_to_tensor(image, name='image') image = _AssertAtLeast3DImage(image) shape = image.get_shape() if shape.ndims == 3 or shape.ndims is None: - return fix_image_flip_shape(image, array_ops.reverse(image, [0])) + return fix_image_flip_shape(image, array_ops.reverse(image, [flip_index])) elif shape.ndims == 4: - return array_ops.reverse(image, [1]) + return array_ops.reverse(image, [flip_index+1]) else: raise ValueError('\'image\' must have either 3 or 4 dimensions.') diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py index 39b72951249..f93bf0a17f3 100644 --- a/tensorflow/python/ops/init_ops.py +++ b/tensorflow/python/ops/init_ops.py @@ -39,10 +39,10 @@ import numpy as np from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops -from tensorflow.python.ops import linalg_ops +from tensorflow.python.ops import linalg_ops_impl +from tensorflow.python.ops import gen_linalg_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops -from tensorflow.python.ops import random_ops from tensorflow.python.util.deprecation import deprecated from tensorflow.python.util.tf_export import tf_export @@ -529,7 +529,7 @@ class Orthogonal(Initializer): # Generate a random matrix a = random_ops.random_normal(flat_shape, dtype=dtype, seed=self.seed) # Compute the qr factorization - q, r = linalg_ops.qr(a, full_matrices=False) + q, r = gen_linalg_ops.qr(a, full_matrices=False) # Make Q uniform d = array_ops.diag_part(r) q *= math_ops.sign(d) @@ -577,7 +577,7 @@ class ConvolutionDeltaOrthogonal(Initializer): a = random_ops.random_normal([shape[-1], shape[-1]], dtype=dtype, seed=self.seed) # Compute the qr factorization - q, r = linalg_ops.qr(a, full_matrices=False) + q, r = gen_linalg_ops.qr(a, full_matrices=False) # Make Q uniform d = array_ops.diag_part(r) q *= math_ops.sign(d) @@ -636,7 +636,7 @@ class ConvolutionOrthogonal(Initializer): a = random_ops.random_normal([n, n], dtype=self.dtype, seed=self.seed) if self.seed: self.seed += 1 - q, r = linalg_ops.qr(a) + q, r = gen_linalg_ops.qr(a) d = array_ops.diag_part(r) # make q uniform q *= math_ops.sign(d) @@ -723,7 +723,7 @@ class 
ConvolutionOrthogonal2D(ConvolutionOrthogonal): raise ValueError("The dimension of the matrices must be the same.") n = p1.shape.as_list()[0] kernel2x2 = {} - eye = linalg_ops.eye(n, dtype=self.dtype) + eye = linalg_ops_impl.eye(n, dtype=self.dtype) kernel2x2[0, 0] = math_ops.matmul(p1, p2) kernel2x2[0, 1] = math_ops.matmul(p1, (eye - p2)) kernel2x2[1, 0] = math_ops.matmul((eye - p1), p2) @@ -848,7 +848,7 @@ class ConvolutionOrthogonal1D(ConvolutionOrthogonal): """ n = projection_matrix.shape.as_list()[0] kernel = {} - eye = linalg_ops.eye(n, dtype=self.dtype) + eye = linalg_ops_impl.eye(n, dtype=self.dtype) kernel[0] = projection_matrix kernel[1] = eye - projection_matrix return kernel @@ -976,7 +976,7 @@ class ConvolutionOrthogonal3D(ConvolutionOrthogonal): if p1_shape != p2.shape.as_list() or p1_shape != p3.shape.as_list(): raise ValueError("The dimension of the matrices must be the same.") n = p1_shape[0] - eye = linalg_ops.eye(n, dtype=self.dtype) + eye = linalg_ops_impl.eye(n, dtype=self.dtype) kernel2x2x2 = {} def matmul(p1, p2, p3): return math_ops.matmul(math_ops.matmul(p1, p2), p3) @@ -1084,7 +1084,7 @@ class Identity(Initializer): "Identity matrix initializer can only be used for 2D matrices.") if dtype is None: dtype = self.dtype - initializer = linalg_ops.eye(*full_shape, dtype=dtype) + initializer = linalg_ops_impl.eye(*full_shape, dtype=dtype) if partition_info is not None: initializer = array_ops.slice(initializer, partition_info.var_offset, shape) diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py index 170861b43fd..a0dfa543f9b 100644 --- a/tensorflow/python/ops/linalg_ops.py +++ b/tensorflow/python/ops/linalg_ops.py @@ -24,12 +24,13 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import functional_ops from tensorflow.python.ops import gen_linalg_ops +from tensorflow.python.ops import linalg_ops_impl from tensorflow.python.ops import math_ops # pylint: disable=wildcard-import from tensorflow.python.ops.gen_linalg_ops import * # pylint: enable=wildcard-import -from tensorflow.python.util import compat from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import tf_export @@ -159,36 +160,11 @@ def eye(num_rows, Returns: A `Tensor` of shape `batch_shape + [num_rows, num_columns]` """ - with ops.name_scope( - name, default_name='eye', values=[num_rows, num_columns, batch_shape]): - is_square = num_columns is None - batch_shape = [] if batch_shape is None else batch_shape - num_columns = num_rows if num_columns is None else num_columns - if isinstance(num_rows, ops.Tensor) or isinstance( - num_columns, ops.Tensor) or isinstance(batch_shape, ops.Tensor): - batch_shape = ops.convert_to_tensor( - batch_shape, name='shape', dtype=dtypes.int32) - diag_size = math_ops.minimum(num_rows, num_columns) - diag_shape = array_ops.concat((batch_shape, [diag_size]), 0) - if not is_square: - shape = array_ops.concat((batch_shape, [num_rows, num_columns]), 0) - else: - if not isinstance(num_rows, compat.integral_types) or not isinstance( - num_columns, compat.integral_types): - raise TypeError( - 'num_rows and num_columns must be positive integer values.') - batch_shape = [dim for dim in batch_shape] - is_square = num_rows == num_columns - diag_shape = batch_shape + [np.minimum(num_rows, num_columns)] - if not is_square: - shape = batch_shape + [num_rows, 
num_columns] - - diag_ones = array_ops.ones(diag_shape, dtype=dtype) - if is_square: - return array_ops.matrix_diag(diag_ones) - else: - zero_matrix = array_ops.zeros(shape, dtype=dtype) - return array_ops.matrix_set_diag(zero_matrix, diag_ones) + return linalg_ops_impl.eye(num_rows, + num_columns=num_columns, + batch_shape=batch_shape, + dtype=dtype, + name=name) @tf_export('matrix_solve_ls', 'linalg.lstsq') @@ -454,7 +430,7 @@ def norm(tensor, This function can compute several different vector norms (the 1-norm, the Euclidean or 2-norm, the inf-norm, and in general the p-norm for p > 0) and - matrix norms (Frobenius, 1-norm, and inf-norm). + matrix norms (Frobenius, 1-norm, 2-norm and inf-norm). Args: tensor: `Tensor` of types `float32`, `float64`, `complex64`, `complex128` @@ -465,7 +441,7 @@ def norm(tensor, Some restrictions apply: a) The Frobenius norm `fro` is not defined for vectors, b) If axis is a 2-tuple (matrix norm), only 'euclidean', 'fro', `1`, - `np.inf` are supported. + `2`, `np.inf` are supported. See the description of `axis` on how to compute norms for a batch of vectors or matrices stored in a tensor. axis: If `axis` is `None` (the default), the input is considered a vector @@ -521,8 +497,7 @@ def norm(tensor, axis[0] == axis[1]): raise ValueError( "'axis' must be None, an integer, or a tuple of 2 unique integers") - # TODO(rmlarsen): Implement matrix 2-norm using tf.svd(). - supported_matrix_norms = ['euclidean', 'fro', 1, np.inf] + supported_matrix_norms = ['euclidean', 'fro', 1, 2, np.inf] if ord not in supported_matrix_norms: raise ValueError("'ord' must be a supported matrix norm in %s, got %s" % (supported_matrix_norms, ord)) @@ -539,12 +514,34 @@ def norm(tensor, with ops.name_scope(name, 'norm', [tensor]): tensor = ops.convert_to_tensor(tensor) + if ord in ['fro', 'euclidean', 2, 2.0]: - # TODO(rmlarsen): Move 2-norm to a separate clause once we support it for - # matrices. - result = math_ops.sqrt( - math_ops.reduce_sum( - tensor * math_ops.conj(tensor), axis, keepdims=True)) + if is_matrix_norm and ord in [2, 2.0]: + rank = array_ops.rank(tensor) + positive_axis = functional_ops.map_fn( + lambda i: control_flow_ops.cond(i >= 0, lambda: i, lambda: i + rank), + ops.convert_to_tensor(axis)) + axes = math_ops.range(rank) + perm_before = array_ops.concat( + [array_ops.setdiff1d(axes, positive_axis)[0], positive_axis], + axis=0) + perm_after = functional_ops.map_fn( + lambda i: math_ops.cast( + array_ops.squeeze( + array_ops.where(math_ops.equal(perm_before, i))), + dtype=dtypes.int32), axes) + permed = array_ops.transpose(tensor, perm=perm_before) + matrix_2_norm = array_ops.expand_dims( + math_ops.reduce_max( + math_ops.abs(gen_linalg_ops.svd(permed, compute_uv=False)[0]), + axis=-1, + keepdims=True), + axis=-1) + result = array_ops.transpose(matrix_2_norm, perm=perm_after) + else: + result = math_ops.sqrt( + math_ops.reduce_sum( + tensor * math_ops.conj(tensor), axis, keepdims=True)) else: result = math_ops.abs(tensor) if ord == 1: diff --git a/tensorflow/python/ops/linalg_ops_impl.py b/tensorflow/python/ops/linalg_ops_impl.py new file mode 100644 index 00000000000..e7c89f6ae3e --- /dev/null +++ b/tensorflow/python/ops/linalg_ops_impl.py @@ -0,0 +1,73 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Operations for linear algebra.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.util import compat + +# Names below are lower_case. +# pylint: disable=invalid-name + + +def eye(num_rows, + num_columns=None, + batch_shape=None, + dtype=dtypes.float32, + name=None): + """Construct an identity matrix, or a batch of matrices. + + See `linalg_ops.eye`. + """ + with ops.name_scope( + name, default_name='eye', values=[num_rows, num_columns, batch_shape]): + is_square = num_columns is None + batch_shape = [] if batch_shape is None else batch_shape + num_columns = num_rows if num_columns is None else num_columns + if isinstance(num_rows, ops.Tensor) or isinstance( + num_columns, ops.Tensor) or isinstance(batch_shape, ops.Tensor): + batch_shape = ops.convert_to_tensor( + batch_shape, name='shape', dtype=dtypes.int32) + diag_size = math_ops.minimum(num_rows, num_columns) + diag_shape = array_ops.concat((batch_shape, [diag_size]), 0) + if not is_square: + shape = array_ops.concat((batch_shape, [num_rows, num_columns]), 0) + else: + if not isinstance(num_rows, compat.integral_types) or not isinstance( + num_columns, compat.integral_types): + raise TypeError( + 'num_rows and num_columns must be positive integer values.') + batch_shape = [dim for dim in batch_shape] + is_square = num_rows == num_columns + diag_shape = batch_shape + [np.minimum(num_rows, num_columns)] + if not is_square: + shape = batch_shape + [num_rows, num_columns] + + diag_ones = array_ops.ones(diag_shape, dtype=dtype) + if is_square: + return array_ops.matrix_diag(diag_ones) + else: + zero_matrix = array_ops.zeros(shape, dtype=dtype) + return array_ops.matrix_set_diag(zero_matrix, diag_ones) + +# pylint: enable=invalid-name,redefined-builtin diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py index 34ca1adc3e1..9fc545c9678 100644 --- a/tensorflow/python/ops/losses/losses_impl.py +++ b/tensorflow/python/ops/losses/losses_impl.py @@ -29,6 +29,7 @@ from tensorflow.python.ops import nn_ops from tensorflow.python.ops import weights_broadcast_ops from tensorflow.python.ops.losses import util from tensorflow.python.util.deprecation import deprecated_args +from tensorflow.python.util.deprecation import deprecated_argument_lookup from tensorflow.python.util.tf_export import tf_export @@ -306,11 +307,8 @@ def cosine_distance( ValueError: If `predictions` shape doesn't match `labels` shape, or `axis`, `labels`, `predictions` or `weights` is `None`. 
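For readers skimming the new `linalg_ops_impl.eye` above, a rough NumPy analogue of its static-shape branch may help (a sketch only, assuming plain Python integers for `num_rows`/`num_columns` and no Tensor-valued `batch_shape`; the TensorFlow code additionally handles Tensor arguments):

```python
import numpy as np

def eye(num_rows, num_columns=None, batch_shape=None, dtype=np.float32):
    num_columns = num_rows if num_columns is None else num_columns
    batch_shape = list(batch_shape or [])
    out = np.zeros(batch_shape + [num_rows, num_columns], dtype=dtype)
    idx = np.arange(min(num_rows, num_columns))
    out[..., idx, idx] = 1  # ones on the main diagonal of every batch member
    return out

print(eye(2, num_columns=3, batch_shape=[4]).shape)  # (4, 2, 3)
```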
""" - if dim is not None: - if axis is not None: - raise ValueError("Cannot specify both 'axis' and 'dim'") - axis = dim - if axis is None and dim is None: + axis = deprecated_argument_lookup("axis", axis, "dim", dim) + if axis is None: raise ValueError("You must specify 'axis'.") if labels is None: raise ValueError("labels must not be None.") @@ -696,7 +694,7 @@ def softmax_cross_entropy( onehot_labels, logits, weights=1.0, label_smoothing=0, scope=None, loss_collection=ops.GraphKeys.LOSSES, reduction=Reduction.SUM_BY_NONZERO_WEIGHTS): - """Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits. + """Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits_v2. `weights` acts as a coefficient for the loss. If a scalar is provided, then the loss is simply scaled by the given value. If `weights` is a @@ -707,11 +705,16 @@ def softmax_cross_entropy( new_onehot_labels = onehot_labels * (1 - label_smoothing) + label_smoothing / num_classes + Note that `onehot_labels` and `logits` must have the same shape, + e.g. `[batch_size, num_classes]`. The shape of `weights` must be + broadcastable to loss, whose shape is decided by the shape of `logits`. + In case the shape of `logits` is `[batch_size, num_classes]`, loss is + a `Tensor` of shape `[batch_size]`. + Args: - onehot_labels: `[batch_size, num_classes]` target one-hot-encoded labels. - logits: `[batch_size, num_classes]` logits outputs of the network . - weights: Optional `Tensor` whose rank is either 0, or rank 1 and is - broadcastable to the loss which is a `Tensor` of shape `[batch_size]`. + onehot_labels: One-hot-encoded labels. + logits: Logits outputs of the network. + weights: Optional `Tensor` that is broadcastable to loss. label_smoothing: If greater than 0 then smooth the labels. scope: the scope for the operations performed in computing the loss. loss_collection: collection to which the loss will be added. diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 2b04866fef4..2feb88cb7bc 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -211,11 +211,9 @@ def argmax(input, name=None, dimension=None, output_type=dtypes.int64): - if dimension is not None: - if axis is not None: - raise ValueError("Cannot specify both 'axis' and 'dimension'") - axis = dimension - elif axis is None: + axis = deprecation.deprecated_argument_lookup( + "axis", axis, "dimension", dimension) + if axis is None: axis = 0 return gen_math_ops.arg_max(input, axis, name=name, output_type=output_type) @@ -231,11 +229,9 @@ def argmin(input, name=None, dimension=None, output_type=dtypes.int64): - if dimension is not None: - if axis is not None: - raise ValueError("Cannot specify both 'axis' and 'dimension'") - axis = dimension - elif axis is None: + axis = deprecation.deprecated_argument_lookup( + "axis", axis, "dimension", dimension) + if axis is None: axis = 0 return gen_math_ops.arg_min(input, axis, name=name, output_type=output_type) @@ -761,13 +757,25 @@ def cast(x, dtype, name=None): tf.cast(x, tf.int32) # [1, 2], dtype=tf.int32 ``` + The operation supports data types (for `x` and `dtype`) of + `uint8`, `int8`, `uint16`, `int16`, `int32`, `int64`, `float16`, `float32`, + `float64`, `complex64`, `complex128`, `bfloat16`. In case of casting from + complex types (`complex64`, `complex128`) to real types, only the real part + of `x` is returned. 
In case of casting from real types to complex types + (`complex64`, `complex128`), the imaginary part of the returned value is set + to `0`. The handling of complex types here matches the behavior of numpy. + Args: - x: A `Tensor` or `SparseTensor`. - dtype: The destination type. + x: A `Tensor` or `SparseTensor` of numeric type. It could be + `uint8`, `int8`, `uint16`, `int16`, `int32`, `int64`, + `float16`, `float32`, `float64`, `complex64`, `complex128`, `bfloat16`. + dtype: The destination type. The list of supported dtypes is the same + as `x`. name: A name for the operation (optional). Returns: - A `Tensor` or `SparseTensor` with same shape as `x`. + A `Tensor` or `SparseTensor` with same shape as `x` and + same type as `dtype`. Raises: TypeError: If `x` cannot be cast to the `dtype`. @@ -1634,7 +1642,7 @@ def reduce_min(input_tensor, tensor with a single element is returned. Args: - input_tensor: The tensor to reduce. Should have numeric type. + input_tensor: The tensor to reduce. Should have real numeric type. axis: The dimensions to reduce. If `None` (the default), reduces all dimensions. Must be in the range `[-rank(input_tensor), rank(input_tensor))`. @@ -1683,7 +1691,7 @@ def reduce_max(input_tensor, tensor with a single element is returned. Args: - input_tensor: The tensor to reduce. Should have numeric type. + input_tensor: The tensor to reduce. Should have real numeric type. axis: The dimensions to reduce. If `None` (the default), reduces all dimensions. Must be in the range `[-rank(input_tensor), rank(input_tensor))`. diff --git a/tensorflow/python/ops/nn.py b/tensorflow/python/ops/nn.py index 244702d13be..1d0d9a52a12 100644 --- a/tensorflow/python/ops/nn.py +++ b/tensorflow/python/ops/nn.py @@ -98,6 +98,7 @@ See the @{$python/nn} guide. @@fixed_unigram_candidate_sampler @@compute_accidental_hits @@quantized_conv2d +@@quantized_relu @@quantized_relu_x @@quantized_max_pool @@quantized_avg_pool diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py index 47cc4da7f2a..d0d5ed07ced 100644 --- a/tensorflow/python/ops/nn_impl.py +++ b/tensorflow/python/ops/nn_impl.py @@ -987,7 +987,7 @@ def _compute_sampled_logits(weights, class biases. labels: A `Tensor` of type `int64` and shape `[batch_size, num_true]`. The target classes. Note that this format differs from - the `labels` argument of `nn.softmax_cross_entropy_with_logits`. + the `labels` argument of `nn.softmax_cross_entropy_with_logits_v2`. inputs: A `Tensor` of shape `[batch_size, dim]`. The forward activations of the input network. num_sampled: An `int`. The number of classes to randomly sample per batch. @@ -1012,7 +1012,7 @@ def _compute_sampled_logits(weights, out_logits: `Tensor` object with shape `[batch_size, num_true + num_sampled]`, for passing to either `nn.sigmoid_cross_entropy_with_logits` (NCE) or - `nn.softmax_cross_entropy_with_logits` (sampled softmax). + `nn.softmax_cross_entropy_with_logits_v2` (sampled softmax). out_labels: A Tensor object with the same shape as `out_logits`. """ @@ -1285,7 +1285,7 @@ def sampled_softmax_loss(weights, logits = tf.matmul(inputs, tf.transpose(weights)) logits = tf.nn.bias_add(logits, biases) labels_one_hot = tf.one_hot(labels, n_classes) - loss = tf.nn.softmax_cross_entropy_with_logits( + loss = tf.nn.softmax_cross_entropy_with_logits_v2( labels=labels_one_hot, logits=logits) ``` @@ -1303,7 +1303,7 @@ def sampled_softmax_loss(weights, biases: A `Tensor` of shape `[num_classes]`. The class biases. 
labels: A `Tensor` of type `int64` and shape `[batch_size, num_true]`. The target classes. Note that this format differs from - the `labels` argument of `nn.softmax_cross_entropy_with_logits`. + the `labels` argument of `nn.softmax_cross_entropy_with_logits_v2`. inputs: A `Tensor` of shape `[batch_size, dim]`. The forward activations of the input network. num_sampled: An `int`. The number of classes to randomly sample per batch. @@ -1340,7 +1340,8 @@ def sampled_softmax_loss(weights, partition_strategy=partition_strategy, name=name, seed=seed) - sampled_losses = nn_ops.softmax_cross_entropy_with_logits( + labels = array_ops.stop_gradient(labels, name="labels_stop_gradient") + sampled_losses = nn_ops.softmax_cross_entropy_with_logits_v2( labels=labels, logits=logits) # sampled_losses is a [batch_size] tensor. return sampled_losses diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index bb454b3c3a7..cd07550d2ee 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -1155,7 +1155,7 @@ def atrous_conv2d(value, filters, rate, padding, name=None): Returns: A `Tensor` with the same type as `value`. - Output shape with `'VALID`` padding is: + Output shape with `'VALID'` padding is: [batch, height - 2 * (filter_width - 1), width - 2 * (filter_height - 1), out_channels]. @@ -1458,10 +1458,10 @@ def conv3d_transpose( if isinstance(output_shape, (list, np.ndarray)): # output_shape's shape should be == [5] if reached this point. - if not filter.get_shape()[3].is_compatible_with(output_shape[4]): + if not filter.get_shape()[3].is_compatible_with(output_shape[axis]): raise ValueError( "output_shape does not match filter's output channels, " - "{} != {}".format(output_shape[4], + "{} != {}".format(output_shape[axis], filter.get_shape()[3])) if padding != "VALID" and padding != "SAME": @@ -1986,7 +1986,7 @@ def sparse_softmax_cross_entropy_with_logits( must provide a single specific index for the true class for each row of `logits` (each minibatch entry). For soft softmax classification with a probability distribution for each entry, see - `softmax_cross_entropy_with_logits`. + `softmax_cross_entropy_with_logits_v2`. **WARNING:** This op expects unscaled logits, since it performs a `softmax` on `logits` internally for efficiency. Do not call this op with the diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py index 9251e9802c5..86dc053c0fb 100644 --- a/tensorflow/python/ops/rnn_cell_impl.py +++ b/tensorflow/python/ops/rnn_cell_impl.py @@ -617,9 +617,9 @@ class BasicLSTMCell(LayerRNNCell): Args: inputs: `2-D` tensor with shape `[batch_size, input_size]`. state: An `LSTMStateTuple` of state tensors, each shaped - `[batch_size, self.state_size]`, if `state_is_tuple` has been set to + `[batch_size, num_units]`, if `state_is_tuple` has been set to `True`. Otherwise, a `Tensor` shaped - `[batch_size, 2 * self.state_size]`. + `[batch_size, 2 * num_units]`. Returns: A pair containing the new hidden state, and the new state (either a diff --git a/tensorflow/python/profiler/tfprof_logger_test.py b/tensorflow/python/profiler/tfprof_logger_test.py index 141144f9877..caf3869f56d 100644 --- a/tensorflow/python/profiler/tfprof_logger_test.py +++ b/tensorflow/python/profiler/tfprof_logger_test.py @@ -38,7 +38,7 @@ class TFProfLoggerTest(test.TestCase): return math_ops.matmul(a, b) # pylint: disable=pointless-string-statement - """# TODO(xpan): This this out of core so it doesn't depend on contrib. 
+ """# TODO(xpan): This out of core so it doesn't depend on contrib. def testFillMissingShape(self): a, b, y = self._BuildSmallPlaceholderlModel() run_options = config_pb2.RunOptions( diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py index b88be4ae04d..73ea85ab0c4 100644 --- a/tensorflow/python/tools/saved_model_cli.py +++ b/tensorflow/python/tools/saved_model_cli.py @@ -41,6 +41,7 @@ from tensorflow.python.debug.wrappers import local_cli_wrapper from tensorflow.python.framework import meta_graph as meta_graph_lib from tensorflow.python.framework import ops as ops_lib from tensorflow.python.platform import app # pylint: disable=unused-import +from tensorflow.python.lib.io import file_io from tensorflow.python.saved_model import loader from tensorflow.python.tools import saved_model_utils @@ -543,7 +544,7 @@ def load_inputs_from_input_arg_string(inputs_str, input_exprs_str, input_examples = preprocess_input_examples_arg_string(input_examples_str) for input_tensor_key, (filename, variable_name) in inputs.items(): - data = np.load(filename) + data = np.load(file_io.FileIO(filename, mode='r')) # When a variable_name key is specified for the input file if variable_name: diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py index 3867c0d8daa..70495291bc5 100644 --- a/tensorflow/python/training/saver_test.py +++ b/tensorflow/python/training/saver_test.py @@ -2731,7 +2731,7 @@ class ScopedGraphTest(test.TestCase): # The rest of the variables. rest_variables = list( set(variables.global_variables()) - set(var_list.keys())) - init_rest_op = variables.initialize_variables(rest_variables) + init_rest_op = variables.variables_initializer(rest_variables) with self.test_session(graph=graph) as sess: saver = saver_module.Saver(var_list=var_list, max_to_keep=1) diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py index 4163fcac79e..3358ffe5264 100644 --- a/tensorflow/python/util/compat.py +++ b/tensorflow/python/util/compat.py @@ -42,10 +42,8 @@ import six as _six from tensorflow.python.util.all_util import remove_undocumented from tensorflow.python.util.tf_export import tf_export -from tensorflow.python.util.tf_export import tf_export -@tf_export('compat.as_bytes', 'compat.as_str') def as_bytes(bytes_or_text, encoding='utf-8'): """Converts either bytes or unicode to `bytes`, using utf-8 encoding for text. @@ -68,7 +66,6 @@ def as_bytes(bytes_or_text, encoding='utf-8'): (bytes_or_text,)) -@tf_export('compat.as_text') def as_text(bytes_or_text, encoding='utf-8'): """Returns the given argument as a unicode string. @@ -93,8 +90,12 @@ def as_text(bytes_or_text, encoding='utf-8'): # Convert an object to a `str` in both Python 2 and 3. 
if _six.PY2: as_str = as_bytes + tf_export('compat.as_bytes', 'compat.as_str')(as_bytes) + tf_export('compat.as_text')(as_text) else: as_str = as_text + tf_export('compat.as_bytes')(as_bytes) + tf_export('compat.as_text', 'compat.as_str')(as_text) @tf_export('compat.as_str_any') diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index 640f270323c..102419a2649 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -524,11 +524,12 @@ port::Status CudnnSupport::Init() { ToString(status))}; } -port::StatusOr> CudnnSupport::GetVersion() { +port::StatusOr +CudnnSupport::GetVersion() { CudnnVersion version; TF_RETURN_IF_ERROR(GetLoadedCudnnVersion(&version)); - return std::make_tuple(version.major_version, version.minor_version, - version.patch_level); + return perftools::gputools::dnn::VersionInfo( + version.major_version, version.minor_version, version.patch_level); } // Turns a BatchDescriptor structure into a cudnn tensor handle within a scope. diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h index e6d12bfef98..5ded7cf1543 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.h +++ b/tensorflow/stream_executor/cuda/cuda_dnn.h @@ -45,7 +45,7 @@ class CudnnSupport : public dnn::DnnSupport { ~CudnnSupport() override; port::Status Init() override; - port::StatusOr> GetVersion() override; + port::StatusOr GetVersion() override; port::StatusOr> createRnnDescriptor( int num_layers, int hidden_size, int input_size, diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc index fedf4f53b85..71cab145b9b 100644 --- a/tensorflow/stream_executor/cuda/cuda_driver.cc +++ b/tensorflow/stream_executor/cuda/cuda_driver.cc @@ -37,14 +37,6 @@ limitations under the License. #include "tensorflow/stream_executor/platform/port.h" #include "tensorflow/stream_executor/lib/inlined_vector.h" -#if defined(PLATFORM_WINDOWS) -// TODO: in windows ARRAYSIZE is defined in winnt.h but including it -// here creates a conflict with cuda.h - for now define it here. 
-#define ARRAYSIZE(a) \ - ((sizeof(a) / sizeof(*(a))) / \ - static_cast(!(sizeof(a) % sizeof(*(a))))) -#endif - bool FLAGS_gpuexec_cuda_driver_inject_init_error = false; bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false; bool FLAGS_gpuexec_cuda_device_0_only = false; @@ -719,15 +711,15 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { port::bit_cast(uintptr_t(info_log_buffer_bytes)), port::bit_cast(info_log_buffer.data()), port::bit_cast(uintptr_t(log_verbose))}; - CHECK(ARRAYSIZE(options) == ARRAYSIZE(option_values)); + CHECK(TF_ARRAYSIZE(options) == TF_ARRAYSIZE(option_values)); CUresult res; { // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their // module loading: see http://b/13248943 - res = cuModuleLoadDataEx(module, ptx_data, ARRAYSIZE(options), options, - option_values); + res = cuModuleLoadDataEx(module, ptx_data, TF_ARRAYSIZE(options), + options, option_values); } // The PTX JIT mutates the values in the option values array to reflect the diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc index 9700daca890..7c87d33d21b 100644 --- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc +++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc @@ -1126,7 +1126,7 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const { builder.set_name(device_name); } - for (size_t i = 0; i < ARRAYSIZE(kAllUnqueryableDeviceParams); i++) { + for (size_t i = 0; i < TF_ARRAYSIZE(kAllUnqueryableDeviceParams); i++) { const auto ¶ms = kAllUnqueryableDeviceParams[i]; if (params.cc_major == cc_major_ && params.cc_minor == cc_minor_) { builder.set_blocks_per_core_limit(params.blocks_per_core_limit); diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h index 8e202d115a5..39f21d8b105 100644 --- a/tensorflow/stream_executor/dnn.h +++ b/tensorflow/stream_executor/dnn.h @@ -875,6 +875,22 @@ enum class ElementwiseOperation { kAdd, kMultiply }; string ElementwiseOperationString(ElementwiseOperation op); +// A simple class representing the version of the backing library, to +// workaround the "too perfect forwarding" issue in gcc6+ compilers. +// See PR#16309 and issue #18402 for links discussing the issue. +class VersionInfo { + public: + VersionInfo(int major = 0, int minor = 0, int patch = 0) + : major_(major), minor_(minor), patch_(patch) {} + int major_version() { return major_; } + int minor_version() { return minor_; } + int patch() { return patch_; } + private: + int major_; + int minor_; + int patch_; +}; + // Suite of operations typically used for implementing Deep/Convolutional Neural // Nets. Note: A false return value of an operation indicates the // implementation is not available. @@ -885,8 +901,8 @@ class DnnSupport { virtual port::Status Init() = 0; - // Gets the version of the backing library, as a {major, minor, patch} tuple. - virtual port::StatusOr> GetVersion() { + // Gets the version of the backing library, as a VersionInfo object. 
+ virtual port::StatusOr GetVersion() { return port::UnimplementedError( "DnnSupport::GetVersion not implemented on this platform."); } diff --git a/tensorflow/stream_executor/platform/port.h b/tensorflow/stream_executor/platform/port.h index 259cf380d6c..57ad965ef11 100644 --- a/tensorflow/stream_executor/platform/port.h +++ b/tensorflow/stream_executor/platform/port.h @@ -38,12 +38,6 @@ using tensorflow::uint64; using std::string; #endif -#if !defined(COMPILER_MSVC) -#define ARRAYSIZE(a) \ - ((sizeof(a) / sizeof(*(a))) / \ - static_cast(!(sizeof(a) % sizeof(*(a))))) -#endif - using tensorflow::LinkerInitialized; using tensorflow::LINKER_INITIALIZED; diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 528f811b40a..51e856bed0e 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -163,7 +163,6 @@ def if_override_eigen_strong_inline(a): def get_win_copts(is_external=False): WINDOWS_COPTS = [ - "/D__VERSION__=\\\"MSVC\\\"", "/DPLATFORM_WINDOWS", "/DEIGEN_HAS_C99_MATH", "/DTENSORFLOW_USE_EIGEN_THREADPOOL", @@ -1704,7 +1703,7 @@ def tf_version_info_genrule(): ], outs=["util/version_info.cc"], cmd= - "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\"", + "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\" --git_tag_override=$${GIT_TAG_OVERRIDE:-}", local=1, tools=[clean_dep("//tensorflow/tools/git:gen_git_source.py")],) diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt index 05e603efb7c..c8da55d8021 100644 --- a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt @@ -6,6 +6,10 @@ tf_class { name: "cluster_spec" mtype: "" } + member { + name: "device_fn" + mtype: "" + } member { name: "evaluation_master" mtype: "" @@ -84,7 +88,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\', \'train_distribute\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'\', \'\', \'None\', \'5\', \'10000\', \'100\', \'None\'], " + argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\', \'train_distribute\', \'device_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'\', \'\', \'None\', \'5\', \'10000\', \'100\', \'None\', \'None\'], " } member_method { name: "replace" diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt index c66249999f6..0b12bc060ef 100644 --- a/tensorflow/tools/api/golden/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.pbtxt @@ -1980,6 +1980,10 @@ tf_module { name: "string_split" argspec: "args=[\'source\', \'delimiter\', \'skip_empty\'], varargs=None, keywords=None, defaults=[\' \', \'True\'], " } + member_method { + name: "string_strip" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "string_to_hash_bucket" argspec: "args=[\'string_tensor\', \'num_buckets\', \'name\'], varargs=None, keywords=None, 
defaults=[\'None\'], " diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh index 82042b93c02..5fa75e1d61c 100755 --- a/tensorflow/tools/ci_build/builds/pip.sh +++ b/tensorflow/tools/ci_build/builds/pip.sh @@ -123,6 +123,10 @@ done BAZEL_FLAGS=$(str_strip "${BAZEL_FLAGS}") +if [[ -z "$GIT_TAG_OVERRIDE" ]]; then + BAZEL_FLAGS+=" --action_env=GIT_TAG_OVERRIDE" +fi + echo "Using Bazel flags: ${BAZEL_FLAGS}" PIP_BUILD_TARGET="//tensorflow/tools/pip_package:build_pip_package" diff --git a/tensorflow/tools/ci_build/builds/test_user_ops.sh b/tensorflow/tools/ci_build/builds/test_user_ops.sh index caa3a40817c..c342367bace 100755 --- a/tensorflow/tools/ci_build/builds/test_user_ops.sh +++ b/tensorflow/tools/ci_build/builds/test_user_ops.sh @@ -213,27 +213,34 @@ USER_OP=$(echo "${USER_OP_SO}" | sed -e 's/\.so//') echo "Invoking user op ${USER_OP} defined in file ${USER_OP_SO} "\ "via pip installation" -ORIG_OUTPUT=$("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; print(tf.Session('').run(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT})))") +function run_op() { + local ORIG_OUTPUT=$1 + local ADDITIONAL_LOG=$2 -# Format OUTPUT for analysis -if [[ -z $(echo "${ORIG_OUTPUT}" | grep -o ',') ]]; then - if [[ ${IS_MAC} == "1" ]]; then - OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -E -e 's/[ \t]+/,/g') + # Format OUTPUT for analysis + if [[ -z $(echo "${ORIG_OUTPUT}" | grep -o ',') ]]; then + if [[ ${IS_MAC} == "1" ]]; then + local OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -E -e 's/[ \t]+/,/g') + else + local OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -r -e 's/[ \t]+/,/g') + fi else - OUTPUT=$(echo "${ORIG_OUTPUT}" | sed -r -e 's/[ \t]+/,/g') + local OUTPUT="${ORIG_OUTPUT}" fi -else - OUTPUT="${ORIG_OUTPUT}" -fi -EQUALS_EXPECTED=$("${PYTHON_BIN_PATH}" -c "print(${OUTPUT} == ${EXPECTED_OUTPUT})") + local EQUALS_EXPECTED=$("${PYTHON_BIN_PATH}" -c "print(${OUTPUT} == ${EXPECTED_OUTPUT})") -if [[ "${EQUALS_EXPECTED}" != "True" ]]; then - die "FAILED: Output from user op (${OUTPUT}) does not match expected "\ -"output ${EXPECTED_OUTPUT}" -else - echo "Output from user op (${OUTPUT}) matches expected output" -fi + if [[ "${EQUALS_EXPECTED}" != "True" ]]; then + local ERROR="FAILED: Output from user op (${OUTPUT}) does not match expected "\ + "output ${EXPECTED_OUTPUT}"${ADDITIONAL_LOG} + die ${ERROR} + else + echo "Output from user op (${OUTPUT}) matches expected output" + fi +} + +run_op $("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; print(tf.Session('').run(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT})))") +run_op $("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; tf.enable_eager_execution(); print(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT}))") " in eager mode" popd diff --git a/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh b/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh index dbf376be6f7..2a9f2951888 100755 --- a/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh +++ b/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh @@ -30,7 +30,10 @@ export PYTHON_BIN_PATH=`which python2` yes "" | $PYTHON_BIN_PATH configure.py # Run bazel test command. Double test timeouts to avoid flakes. +# Setting KMP_BLOCKTIME to 0 lets OpenMP threads to sleep right after parallel execution +# in an MKL primitive. This reduces the effects of an oversubscription of OpenMP threads +# caused by executing multiple tests concurrently. 
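The same `KMP_BLOCKTIME=0` effect can be had outside Bazel when running MKL builds by hand; a small sketch (the variable is read by the OpenMP runtime when it starts, so it must be set before TensorFlow is imported):

```python
import os

# Let OpenMP workers sleep immediately after each parallel region instead of
# spin-waiting, which helps when many tests oversubscribe the cores.
os.environ.setdefault("KMP_BLOCKTIME", "0")

import tensorflow as tf  # noqa: E402 -- imported after the env var on purpose
```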
bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test --test_lang_filters=py -k \ --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only \ - --config=mkl --config=opt --test_output=errors -- \ + --config=mkl --test_env=KMP_BLOCKTIME=0 --config=opt --test_output=errors -- \ //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/... diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat index 97829892b10..3b437d3c58c 100644 --- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat +++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat @@ -31,6 +31,9 @@ IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Prog :: Set ctest binary location. IF DEFINED CTEST_EXE (ECHO CTEST_EXE is set to %CTEST_EXE%) ELSE (SET CTEST_EXE="C:\Program Files\cmake\bin\ctest.exe") +:: Install absl-py. +%PIP_EXE% install --upgrade absl-py + :: Run the CMAKE build to build the pip package. CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\gpu\cmake\run_build.bat if %errorlevel% neq 0 exit /b %errorlevel% @@ -40,9 +43,6 @@ DIR %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\ /S /B > wheel_filename_file set /p WHEEL_FILENAME= const char* tf_git_version() {return "%s";} -const char* tf_compiler_version() {return __VERSION__;} +const char* tf_compiler_version() { +#ifdef _MSC_VER +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) + return "MSVC " TOSTRING(_MSC_FULL_VER); +#else + return __VERSION__; +#endif +} const int tf_cxx11_abi_flag() { #ifdef _GLIBCXX_USE_CXX11_ABI return _GLIBCXX_USE_CXX11_ABI; @@ -197,7 +216,7 @@ const int tf_monolithic_build() { open(filename, "w").write(contents) -def generate(arglist): +def generate(arglist, git_tag_override=None): """Generate version_info.cc as given `destination_file`. Args: @@ -217,6 +236,10 @@ def generate(arglist): `ref_symlink` is unused in this script but passed, because the build system uses that file to detect when commits happen. + git_tag_override: Override the value for the git tag. This is useful for + releases where we want to build the release before the git tag is + created. + Raises: RuntimeError: If ./configure needs to be run, RuntimeError will be raised. """ @@ -234,11 +257,11 @@ def generate(arglist): raise RuntimeError( "Run ./configure again, branch was '%s' but is now '%s'" % (old_branch, new_branch)) - git_version = get_git_version(data["path"]) + git_version = get_git_version(data["path"], git_tag_override) write_version_info(dest_file, git_version) -def raw_generate(output_file): +def raw_generate(output_file, source_dir, git_tag_override=None): """Simple generator used for cmake/make build systems. This does not create any symlinks. It requires the build system @@ -246,9 +269,13 @@ def raw_generate(output_file): Args: output_file: Output filename for the version info cc + source_dir: Base path of the source code + git_tag_override: Override the value for the git tag. This is useful for + releases where we want to build the release before the git tag is + created. """ - git_version = get_git_version(".") + git_version = get_git_version(source_dir, git_tag_override) write_version_info(output_file, git_version) @@ -270,6 +297,11 @@ parser.add_argument( "--gen_root_path", type=str, help="Root path to place generated git files (created by --configure).") +parser.add_argument( + "--git_tag_override", type=str, + help="Override git tag value in the __git_version__ string. 
Useful when " + "creating release builds before the release tag is created.") + parser.add_argument( "--generate", type=str, @@ -281,6 +313,11 @@ parser.add_argument( type=str, help="Generate version_info.cc (simpler version used for cmake/make)") +parser.add_argument( + "--source_dir", + type=str, + help="Base path of the source code (used for cmake/make)") + args = parser.parse_args() if args.configure is not None: @@ -288,9 +325,12 @@ if args.configure is not None: raise RuntimeError("Must pass --gen_root_path arg when running --configure") configure(args.configure, args.gen_root_path, debug=args.debug) elif args.generate is not None: - generate(args.generate) + generate(args.generate, args.git_tag_override) elif args.raw_generate is not None: - raw_generate(args.raw_generate) + source_path = "." + if args.source_dir is not None: + source_path = args.source_dir + raw_generate(args.raw_generate, source_path, args.git_tag_override) else: raise RuntimeError("--configure or --generate or --raw_generate " "must be used") diff --git a/tensorflow/tools/git/gen_git_source.sh b/tensorflow/tools/git/gen_git_source.sh index db20bb00e84..cd128af6b36 100755 --- a/tensorflow/tools/git/gen_git_source.sh +++ b/tensorflow/tools/git/gen_git_source.sh @@ -28,7 +28,15 @@ fi cat < ${OUTPUT_FILENAME} #include const char* tf_git_version() {return "${GIT_VERSION}";} -const char* tf_compiler_version() {return __VERSION__;} +const char* tf_compiler_version() { +#ifdef _MSC_VER +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) + return "MSVC " TOSTRING(_MSC_FULL_VER); +#else + return __VERSION__; +#endif +} const int tf_cxx11_abi_flag() { #ifdef _GLIBCXX_USE_CXX11_ABI return _GLIBCXX_USE_CXX11_ABI; diff --git a/tensorflow/tools/graph_transforms/transform_graph.cc b/tensorflow/tools/graph_transforms/transform_graph.cc index 28387c2b48c..8ce8f5e24b9 100644 --- a/tensorflow/tools/graph_transforms/transform_graph.cc +++ b/tensorflow/tools/graph_transforms/transform_graph.cc @@ -24,6 +24,9 @@ limitations under the License. #include "tensorflow/core/util/command_line_flags.h" #include "tensorflow/tools/graph_transforms/file_utils.h" #include "tensorflow/tools/graph_transforms/transform_utils.h" +#if !defined(PLATFORM_WINDOWS) +#include +#endif namespace tensorflow { namespace graph_transforms { @@ -130,16 +133,64 @@ Status ParseTransformParameters(const string& transforms_string, return Status::OK(); } +std::string ExpandPath(const std::string& path_string) { +#if defined(PLATFORM_WINDOWS) + return path_string; +#else + if (path_string.empty() || path_string[0] != '~') { + return path_string; + } + + const char* home = NULL; + std::string::size_type prefix = path_string.find_first_of('/'); + if (path_string.length() == 1 || prefix == 1) { + // The value of $HOME, e.g., ~/foo + home = getenv("HOME"); + if (!home) { + // If HOME is not available, get uid + struct passwd* pw = getpwuid(getuid()); + if (pw) { + home = pw->pw_dir; + } + } + } else { + // The value of ~user, e.g., ~user/foo + std::string user(path_string, 1, (prefix == std::string::npos) + ? 
std::string::npos + : prefix - 1); + struct passwd* pw = getpwnam(user.c_str()); + if (pw) { + home = pw->pw_dir; + } + } + + if (!home) { + return path_string; + } + + string path(home); + if (prefix == std::string::npos) { + return path; + } + + if (path.length() == 0 || path[path.length() - 1] != '/') { + path += '/'; + } + path += path_string.substr(prefix + 1); + return path; +#endif +} + int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) { - string in_graph = ""; - string out_graph = ""; + string in_graph_string = ""; + string out_graph_string = ""; string inputs_string = ""; string outputs_string = ""; string transforms_string = ""; bool output_as_text = false; std::vector flag_list = { - Flag("in_graph", &in_graph, "input graph file name"), - Flag("out_graph", &out_graph, "output graph file name"), + Flag("in_graph", &in_graph_string, "input graph file name"), + Flag("out_graph", &out_graph_string, "output graph file name"), Flag("inputs", &inputs_string, "inputs"), Flag("outputs", &outputs_string, "outputs"), Flag("transforms", &transforms_string, "list of transforms"), @@ -166,11 +217,11 @@ int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) { LOG(ERROR) << "Unknown argument " << argv[1] << ".\n" << usage; return -1; } - if (in_graph.empty()) { + if (in_graph_string.empty()) { LOG(ERROR) << "in_graph graph can't be empty.\n" << usage; return -1; } - if (out_graph.empty()) { + if (out_graph_string.empty()) { LOG(ERROR) << "out_graph graph can't be empty.\n" << usage; return -1; } @@ -179,6 +230,9 @@ int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) { return -1; } + string in_graph = ExpandPath(in_graph_string); + string out_graph = ExpandPath(out_graph_string); + std::vector inputs = str_util::Split(inputs_string, ','); std::vector outputs = str_util::Split(outputs_string, ','); TransformParameters transform_params; @@ -197,7 +251,7 @@ int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) { GraphDef graph_def; Status load_status = LoadTextOrBinaryGraphFile(in_graph, &graph_def); if (!load_status.ok()) { - LOG(ERROR) << "Loading graph '" << in_graph << "' failed with " + LOG(ERROR) << "Loading graph '" << in_graph_string << "' failed with " << load_status.error_message(); LOG(ERROR) << usage; return -1; @@ -219,7 +273,7 @@ int ParseFlagsAndTransformGraph(int argc, char* argv[], bool init_main) { save_status = WriteBinaryProto(Env::Default(), out_graph, graph_def); } if (!save_status.ok()) { - LOG(ERROR) << "Saving graph '" << out_graph << "' failed with " + LOG(ERROR) << "Saving graph '" << out_graph_string << "' failed with " << save_status.error_message(); return -1; } diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 211f93296bb..f84a91d009f 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -31,7 +31,7 @@ from setuptools.dist import Distribution # This version string is semver compatible, but incompatible with pip. # For pip, we will remove all '-' characters from this string, and use the # result for pip. 
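The `ExpandPath` helper added above reimplements, for the graph-transform CLI, what Python exposes as `os.path.expanduser`; a quick cross-check of the two forms it handles (`~/...` and `~user/...`):

```python
import os.path

for p in ("~/graph.pb", "~nobody/graph.pb", "/tmp/graph.pb"):
    # expanduser leaves the path untouched when the user is unknown,
    # matching the C++ fallback of returning path_string unchanged.
    print(p, "->", os.path.expanduser(p))
```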
-_VERSION = '1.7.0' +_VERSION = '1.8.0-rc0' REQUIRED_PACKAGES = [ 'absl-py >= 0.1.6', diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index bbef4b9e5f9..8b26a32eac1 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -167,11 +167,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "gemmlowp", urls = [ - "https://mirror.bazel.build/github.com/google/gemmlowp/archive/7c7c744640ddc3d0af18fb245b4d23228813a71b.zip", - "https://github.com/google/gemmlowp/archive/7c7c744640ddc3d0af18fb245b4d23228813a71b.zip", + # TODO (yongtang): uncomment once mirror.bazel.build is propagated. + # "https://mirror.bazel.build/github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip", + "https://github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip", ], - sha256 = "b852cc90259a7357c8a323f108f2cec6e85979fc3b18b5590b99e0130044b2cf", - strip_prefix = "gemmlowp-7c7c744640ddc3d0af18fb245b4d23228813a71b", + sha256 = "b87faa7294dfcc5d678f22a59d2c01ca94ea1e2a3b488c38a95a67889ed0a658", + strip_prefix = "gemmlowp-38ebac7b059e84692f53e5938f97a9943c120d98", ) tf_http_archive( diff --git a/third_party/repo.bzl b/third_party/repo.bzl index aa178fa8cab..36f5aa5bdee 100644 --- a/third_party/repo.bzl +++ b/third_party/repo.bzl @@ -17,6 +17,7 @@ _SINGLE_URL_WHITELIST = depset([ "arm_compiler", "ortools_archive", + "gemmlowp", ]) def _is_windows(ctx): @@ -68,7 +69,7 @@ def _apply_delete(ctx, paths): _execute_and_check_ret_code(ctx, cmd) def _tf_http_archive(ctx): - if ("mirror.bazel.build" not in ctx.attr.urls[0] or + if ("mirror.bazel.build" not in ctx.attr.urls[0] and (len(ctx.attr.urls) < 2 and ctx.attr.name not in _SINGLE_URL_WHITELIST)): fail("tf_http_archive(urls) must have redundant URLs. The " + From 1bb16a262900dce73e8d757d9ad29feed0c878ad Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 23 Apr 2018 21:46:39 -0700 Subject: [PATCH 0646/1734] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 194033378 --- tensorflow/go/op/wrappers.go | 2508 +++++++++++++++++----------------- 1 file changed, 1254 insertions(+), 1254 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index c31ca8b67a1..d038846c4f2 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -2243,81 +2243,170 @@ func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Ou return op.Output(0) } -// Returns the complex conjugate of a complex number. +// Gather slices from `params` into a Tensor with shape specified by `indices`. // -// Given a tensor `input` of complex numbers, this operation returns a tensor of -// complex numbers that are the complex conjugate of each element in `input`. The -// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the -// real part and *b* is the imaginary part. +// `indices` is an K-dimensional integer tensor, best thought of as a +// (K-1)-dimensional tensor of indices into `params`, where each element defines a +// slice of `params`: // -// The complex conjugate returned by this operation is of the form \\(a - bj\\). +// output[i_0, ..., i_{K-2}] = params[indices[i0, ..., i_{K-2}]] // -// For example: +// Whereas in @{tf.gather} `indices` defines slices into the first +// dimension of `params`, in `tf.gather_nd`, `indices` defines slices into the +// first `N` dimensions of `params`, where `N = indices.shape[-1]`. 
// +// The last dimension of `indices` can be at most the rank of +// `params`: +// +// indices.shape[-1] <= params.rank +// +// The last dimension of `indices` corresponds to elements +// (if `indices.shape[-1] == params.rank`) or slices +// (if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]` +// of `params`. The output tensor has shape +// +// indices.shape[:-1] + params.shape[indices.shape[-1]:] +// +// Note that on CPU, if an out of bound index is found, an error is returned. +// On GPU, if an out of bound index is found, a 0 is stored in the +// corresponding output value. +// +// Some examples below. +// +// Simple indexing into a matrix: +// +// ```python +// indices = [[0, 0], [1, 1]] +// params = [['a', 'b'], ['c', 'd']] +// output = ['a', 'd'] // ``` -// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j] -// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j] +// +// Slice indexing into a matrix: +// +// ```python +// indices = [[1], [0]] +// params = [['a', 'b'], ['c', 'd']] +// output = [['c', 'd'], ['a', 'b']] // ``` -func Conj(scope *Scope, input tf.Output) (output tf.Output) { +// +// Indexing into a 3-tensor: +// +// ```python +// indices = [[1]] +// params = [[['a0', 'b0'], ['c0', 'd0']], +// [['a1', 'b1'], ['c1', 'd1']]] +// output = [[['a1', 'b1'], ['c1', 'd1']]] +// +// +// indices = [[0, 1], [1, 0]] +// params = [[['a0', 'b0'], ['c0', 'd0']], +// [['a1', 'b1'], ['c1', 'd1']]] +// output = [['c0', 'd0'], ['a1', 'b1']] +// +// +// indices = [[0, 0, 1], [1, 0, 1]] +// params = [[['a0', 'b0'], ['c0', 'd0']], +// [['a1', 'b1'], ['c1', 'd1']]] +// output = ['b0', 'b1'] +// ``` +// +// Batched indexing into a matrix: +// +// ```python +// indices = [[[0, 0]], [[0, 1]]] +// params = [['a', 'b'], ['c', 'd']] +// output = [['a'], ['b']] +// ``` +// +// Batched slice indexing into a matrix: +// +// ```python +// indices = [[[1]], [[0]]] +// params = [['a', 'b'], ['c', 'd']] +// output = [[['c', 'd']], [['a', 'b']]] +// ``` +// +// Batched indexing into a 3-tensor: +// +// ```python +// indices = [[[1]], [[0]]] +// params = [[['a0', 'b0'], ['c0', 'd0']], +// [['a1', 'b1'], ['c1', 'd1']]] +// output = [[[['a1', 'b1'], ['c1', 'd1']]], +// [[['a0', 'b0'], ['c0', 'd0']]]] +// +// indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]] +// params = [[['a0', 'b0'], ['c0', 'd0']], +// [['a1', 'b1'], ['c1', 'd1']]] +// output = [[['c0', 'd0'], ['a1', 'b1']], +// [['a0', 'b0'], ['c1', 'd1']]] +// +// +// indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]] +// params = [[['a0', 'b0'], ['c0', 'd0']], +// [['a1', 'b1'], ['c1', 'd1']]] +// output = [['b0', 'b1'], ['d0', 'c1']] +// ``` +// +// Arguments: +// params: The tensor from which to gather values. +// indices: Index tensor. +// +// Returns Values from `params` gathered from indices given by `indices`, with +// shape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`. +func GatherNd(scope *Scope, params tf.Output, indices tf.Output) (output tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "Conj", + Type: "GatherNd", Input: []tf.Input{ - input, + params, indices, }, } op := scope.AddOperation(opspec) return op.Output(0) } -// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum. -type ResourceSparseApplyMomentumAttr func(optionalAttr) +// GatherAttr is an optional argument to Gather. +type GatherAttr func(optionalAttr) -// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value. 
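A NumPy cross-check of the `GatherNd` contract documented above, i.e. `output[i_0, ..., i_{K-2}] = params[indices[i_0, ..., i_{K-2}]]` (a sketch only; it ignores the CPU/GPU out-of-bound handling notes):

```python
import numpy as np

def gather_nd(params, indices):
    indices = np.asarray(indices)
    # Move the index-depth axis to the front and use it as a tuple of
    # per-dimension index arrays, matching the semantics described above.
    return params[tuple(np.moveaxis(indices, -1, 0))]

params = np.array([['a', 'b'], ['c', 'd']])
print(gather_nd(params, [[0, 0], [1, 1]]))  # ['a' 'd']
print(gather_nd(params, [[1], [0]]))        # [['c' 'd'] ['a' 'b']]
```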
-// -// value: If `True`, updating of the var and accum tensors will be protected -// by a lock; otherwise the behavior is undefined, but may exhibit less -// contention. -// If not specified, defaults to false -func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr { +// GatherValidateIndices sets the optional validate_indices attribute to value. +// If not specified, defaults to true +func GatherValidateIndices(value bool) GatherAttr { return func(m optionalAttr) { - m["use_locking"] = value + m["validate_indices"] = value } } -// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value. +// Gather slices from `params` according to `indices`. // -// value: If `True`, the tensor passed to compute grad will be -// var - lr * momentum * accum, so in the end, the var you get is actually -// var - lr * momentum * accum. -// If not specified, defaults to false -func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr { - return func(m optionalAttr) { - m["use_nesterov"] = value - } -} - -// Update relevant entries in '*var' and '*accum' according to the momentum scheme. +// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D). +// Produces an output tensor with shape `indices.shape + params.shape[1:]` where: // -// Set use_nesterov = True if you want to use Nesterov momentum. +// ```python +// # Scalar indices +// output[:, ..., :] = params[indices, :, ... :] // -// That is for rows we have grad for, we update var and accum as follows: +// # Vector indices +// output[i, :, ..., :] = params[indices[i], :, ... :] // -// accum = accum * momentum + grad -// var -= lr * accum +// # Higher rank indices +// output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :] +// ``` // -// Arguments: -// var_: Should be from a Variable(). -// accum: Should be from a Variable(). -// lr: Learning rate. Must be a scalar. -// grad: The gradient. -// indices: A vector of indices into the first dimension of var and accum. -// momentum: Momentum. Must be a scalar. +// If `indices` is a permutation and `len(indices) == params.shape[0]` then +// this operation will permute `params` accordingly. // -// Returns the created operation. -func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) { +// `validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in +// `indices` are always validated to be within range. If assigned to GPU, +// out-of-bound indices result in safe but unspecified behavior, which may include +// raising an error. +// +//
+//
+//
+func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...GatherAttr) (output tf.Output) { if scope.Err() != nil { return } @@ -2326,13 +2415,14 @@ func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, a(attrs) } opspec := tf.OpSpec{ - Type: "ResourceSparseApplyMomentum", + Type: "Gather", Input: []tf.Input{ - var_, accum, lr, grad, indices, momentum, + params, indices, }, Attrs: attrs, } - return scope.AddOperation(opspec) + op := scope.AddOperation(opspec) + return op.Output(0) } // Clips tensor values to a specified min and max. @@ -4548,62 +4638,6 @@ func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min return op.Output(0), op.Output(1), op.Output(2) } -// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth. -type HistogramFixedWidthAttr func(optionalAttr) - -// HistogramFixedWidthDtype sets the optional dtype attribute to value. -// If not specified, defaults to DT_INT32 -func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr { - return func(m optionalAttr) { - m["dtype"] = value - } -} - -// Return histogram of values. -// -// Given the tensor `values`, this operation returns a rank 1 histogram counting -// the number of entries in `values` that fall into every bin. The bins are -// equal width and determined by the arguments `value_range` and `nbins`. -// -// ```python -// # Bins will be: (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) -// nbins = 5 -// value_range = [0.0, 5.0] -// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] -// -// with tf.get_default_session() as sess: -// hist = tf.histogram_fixed_width(new_values, value_range, nbins=5) -// variables.global_variables_initializer().run() -// sess.run(hist) => [2, 1, 1, 0, 2] -// ``` -// -// Arguments: -// values: Numeric `Tensor`. -// value_range: Shape [2] `Tensor` of same `dtype` as `values`. -// values <= value_range[0] will be mapped to hist[0], -// values >= value_range[1] will be mapped to hist[-1]. -// nbins: Scalar `int32 Tensor`. Number of histogram bins. -// -// Returns A 1-D `Tensor` holding histogram of values. -func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "HistogramFixedWidth", - Input: []tf.Input{ - values, value_range, nbins, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // Adds Tensor 'bias' to Tensor 'input' for Quantized types. // // Broadcasts the values of bias on dimensions 0..N-2 of 'input'. @@ -7020,38 +7054,107 @@ func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_ke return sparse_indices, sparse_values, sparse_shapes, dense_values } -// Real-valued fast Fourier transform. +// DecodeRawAttr is an optional argument to DecodeRaw. +type DecodeRawAttr func(optionalAttr) + +// DecodeRawLittleEndian sets the optional little_endian attribute to value. // -// Computes the 1-dimensional discrete Fourier transform of a real-valued signal -// over the inner-most dimension of `input`. -// -// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the -// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term, -// followed by the `fft_length / 2` positive-frequency terms. 
-// -// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the -// corresponding dimension of `input`, the dimension is cropped. If it is larger, -// the dimension is padded with zeros. +// value: Whether the input `bytes` are in little-endian order. +// Ignored for `out_type` values that are stored in a single byte like +// `uint8`. +// If not specified, defaults to true +func DecodeRawLittleEndian(value bool) DecodeRawAttr { + return func(m optionalAttr) { + m["little_endian"] = value + } +} + +// Reinterpret the bytes of a string as a vector of numbers. // // Arguments: -// input: A float32 tensor. -// fft_length: An int32 tensor of shape [1]. The FFT length. +// bytes: All the elements must have the same length. // -// Returns A complex64 tensor of the same rank as `input`. The inner-most -// dimension of `input` is replaced with the `fft_length / 2 + 1` unique -// frequency components of its 1D Fourier transform. // -// @compatibility(numpy) -// Equivalent to np.fft.rfft -// @end_compatibility -func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) { +// Returns A Tensor with one more dimension than the input `bytes`. The +// added dimension will have size equal to the length of the elements +// of `bytes` divided by the number of bytes to represent `out_type`. +func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"out_type": out_type} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "DecodeRaw", + Input: []tf.Input{ + bytes, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// Copy a tensor setting everything outside a central band in each innermost matrix +// +// to zero. +// +// The `band` part is computed as follows: +// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a +// tensor with the same shape where +// +// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`. +// +// The indicator function +// +// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) && +// (num_upper < 0 || (n-m) <= num_upper)`. +// +// For example: +// +// ``` +// # if 'input' is [[ 0, 1, 2, 3] +// [-1, 0, 1, 2] +// [-2, -1, 0, 1] +// [-3, -2, -1, 0]], +// +// tf.matrix_band_part(input, 1, -1) ==> [[ 0, 1, 2, 3] +// [-1, 0, 1, 2] +// [ 0, -1, 0, 1] +// [ 0, 0, -1, 0]], +// +// tf.matrix_band_part(input, 2, 1) ==> [[ 0, 1, 0, 0] +// [-1, 0, 1, 0] +// [-2, -1, 0, 1] +// [ 0, -2, -1, 0]] +// ``` +// +// Useful special cases: +// +// ``` +// tf.matrix_band_part(input, 0, -1) ==> Upper triangular part. +// tf.matrix_band_part(input, -1, 0) ==> Lower triangular part. +// tf.matrix_band_part(input, 0, 0) ==> Diagonal. +// ``` +// +// Arguments: +// input: Rank `k` tensor. +// num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire +// lower triangle. +// num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep +// entire upper triangle. +// +// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor. 
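A minimal sketch of the `DecodeRaw` wrapper under the same `tensorflow/go` scaffolding; the input string hand-packs two float32 values (1.0 and 2.0) in little-endian byte order:

```go
package main

import (
	"fmt"
	"log"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	// 1.0f is 0x3f800000 and 2.0f is 0x40000000, written here as little-endian bytes.
	contents := op.Const(s, []string{"\x00\x00\x80\x3f\x00\x00\x00\x40"})
	vals := op.DecodeRaw(s, contents, tf.Float, op.DecodeRawLittleEndian(true))

	graph, err := s.Finalize()
	if err != nil {
		log.Fatal(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer sess.Close()
	res, err := sess.Run(nil, []tf.Output{vals}, nil)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(res[0].Value()) // [[1 2]]: one input string, two decoded floats
}
```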
+func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "RFFT", + Type: "MatrixBandPart", Input: []tf.Input{ - input, fft_length, + input, num_lower, num_upper, }, } op := scope.AddOperation(opspec) @@ -8207,63 +8310,6 @@ func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min return op.Output(0), op.Output(1), op.Output(2) } -// GatherAttr is an optional argument to Gather. -type GatherAttr func(optionalAttr) - -// GatherValidateIndices sets the optional validate_indices attribute to value. -// If not specified, defaults to true -func GatherValidateIndices(value bool) GatherAttr { - return func(m optionalAttr) { - m["validate_indices"] = value - } -} - -// Gather slices from `params` according to `indices`. -// -// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D). -// Produces an output tensor with shape `indices.shape + params.shape[1:]` where: -// -// ```python -// # Scalar indices -// output[:, ..., :] = params[indices, :, ... :] -// -// # Vector indices -// output[i, :, ..., :] = params[indices[i], :, ... :] -// -// # Higher rank indices -// output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :] -// ``` -// -// If `indices` is a permutation and `len(indices) == params.shape[0]` then -// this operation will permute `params` accordingly. -// -// `validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in -// `indices` are always validated to be within range. If assigned to GPU, -// out-of-bound indices result in safe but unspecified behavior, which may include -// raising an error. -// -//
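A sketch of the `MatrixBandPart` wrapper using the example matrix from its doc comment; `num_lower` and `num_upper` are fed as int64 scalar constants, since this version of the wrapper takes them as tensor inputs:

```go
package main

import (
	"fmt"
	"log"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	input := op.Const(s, [][]float32{
		{0, 1, 2, 3},
		{-1, 0, 1, 2},
		{-2, -1, 0, 1},
		{-3, -2, -1, 0},
	})
	// Keep one subdiagonal; num_upper < 0 keeps the entire upper triangle.
	band := op.MatrixBandPart(s, input, op.Const(s, int64(1)), op.Const(s, int64(-1)))

	graph, err := s.Finalize()
	if err != nil {
		log.Fatal(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer sess.Close()
	res, err := sess.Run(nil, []tf.Output{band}, nil)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(res[0].Value()) // entries below the first subdiagonal are zeroed
}
```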
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
-// </div>
-func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...GatherAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Gather", - Input: []tf.Input{ - params, indices, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // Returns the truth value of (x != y) element-wise. // // *NOTE*: `NotEqual` supports broadcasting. More about broadcasting @@ -8386,6 +8432,98 @@ func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional .. return op.Output(0), op.Output(1), op.Output(2) } +// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum. +type ResourceSparseApplyMomentumAttr func(optionalAttr) + +// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value. +// +// value: If `True`, updating of the var and accum tensors will be protected +// by a lock; otherwise the behavior is undefined, but may exhibit less +// contention. +// If not specified, defaults to false +func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value. +// +// value: If `True`, the tensor passed to compute grad will be +// var - lr * momentum * accum, so in the end, the var you get is actually +// var - lr * momentum * accum. +// If not specified, defaults to false +func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr { + return func(m optionalAttr) { + m["use_nesterov"] = value + } +} + +// Update relevant entries in '*var' and '*accum' according to the momentum scheme. +// +// Set use_nesterov = True if you want to use Nesterov momentum. +// +// That is for rows we have grad for, we update var and accum as follows: +// +// accum = accum * momentum + grad +// var -= lr * accum +// +// Arguments: +// var_: Should be from a Variable(). +// accum: Should be from a Variable(). +// lr: Learning rate. Must be a scalar. +// grad: The gradient. +// indices: A vector of indices into the first dimension of var and accum. +// momentum: Momentum. Must be a scalar. +// +// Returns the created operation. +func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceSparseApplyMomentum", + Input: []tf.Input{ + var_, accum, lr, grad, indices, momentum, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Returns the complex conjugate of a complex number. +// +// Given a tensor `input` of complex numbers, this operation returns a tensor of +// complex numbers that are the complex conjugate of each element in `input`. The +// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the +// real part and *b* is the imaginary part. +// +// The complex conjugate returned by this operation is of the form \\(a - bj\\). 
+// +// For example: +// +// ``` +// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j] +// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j] +// ``` +func Conj(scope *Scope, input tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Conj", + Input: []tf.Input{ + input, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // ResizeBilinearAttr is an optional argument to ResizeBilinear. type ResizeBilinearAttr func(optionalAttr) @@ -9799,6 +9937,305 @@ func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, o return op.Output(0) } +// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg. +type DecodeAndCropJpegAttr func(optionalAttr) + +// DecodeAndCropJpegChannels sets the optional channels attribute to value. +// +// value: Number of color channels for the decoded image. +// If not specified, defaults to 0 +func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr { + return func(m optionalAttr) { + m["channels"] = value + } +} + +// DecodeAndCropJpegRatio sets the optional ratio attribute to value. +// +// value: Downscaling ratio. +// If not specified, defaults to 1 +func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr { + return func(m optionalAttr) { + m["ratio"] = value + } +} + +// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value. +// +// value: If true use a slower but nicer upscaling of the +// chroma planes (yuv420/422 only). +// If not specified, defaults to true +func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr { + return func(m optionalAttr) { + m["fancy_upscaling"] = value + } +} + +// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value. +// +// value: If true try to recover an image from truncated input. +// If not specified, defaults to false +func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr { + return func(m optionalAttr) { + m["try_recover_truncated"] = value + } +} + +// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value. +// +// value: The minimum required fraction of lines before a truncated +// input is accepted. +// If not specified, defaults to 1 +func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr { + return func(m optionalAttr) { + m["acceptable_fraction"] = value + } +} + +// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value. +// +// value: string specifying a hint about the algorithm used for +// decompression. Defaults to "" which maps to a system-specific +// default. Currently valid values are ["INTEGER_FAST", +// "INTEGER_ACCURATE"]. The hint may be ignored (e.g., the internal +// jpeg library changes to a version that does not have that specific +// option.) +// If not specified, defaults to "" +func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr { + return func(m optionalAttr) { + m["dct_method"] = value + } +} + +// Decode and Crop a JPEG-encoded image to a uint8 tensor. +// +// The attr `channels` indicates the desired number of color channels for the +// decoded image. +// +// Accepted values are: +// +// * 0: Use the number of channels in the JPEG-encoded image. +// * 1: output a grayscale image. +// * 3: output an RGB image. +// +// If needed, the JPEG-encoded image is transformed to match the requested number +// of color channels. 
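A sketch of the `Conj` wrapper with the values from its doc comment, assuming `op.Const` accepts a `[]complex64` value in this binding:

```go
package main

import (
	"fmt"
	"log"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	// The doc comment's example input: [-2.25+4.75j, 3.25+5.75j].
	input := op.Const(s, []complex64{complex(-2.25, 4.75), complex(3.25, 5.75)})
	out := op.Conj(s, input)

	graph, err := s.Finalize()
	if err != nil {
		log.Fatal(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer sess.Close()
	res, err := sess.Run(nil, []tf.Output{out}, nil)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(res[0].Value()) // [(-2.25-4.75i) (3.25-5.75i)]
}
```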
+// +// The attr `ratio` allows downscaling the image by an integer factor during +// decoding. Allowed values are: 1, 2, 4, and 8. This is much faster than +// downscaling the image later. +// +// +// It is equivalent to a combination of decode and crop, but much faster by only +// decoding partial jpeg image. +// +// Arguments: +// contents: 0-D. The JPEG-encoded image. +// crop_window: 1-D. The crop window: [crop_y, crop_x, crop_height, crop_width]. +// +// Returns 3-D with shape `[height, width, channels]`.. +func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "DecodeAndCropJpeg", + Input: []tf.Input{ + contents, crop_window, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler. +type AllCandidateSamplerAttr func(optionalAttr) + +// AllCandidateSamplerSeed sets the optional seed attribute to value. +// +// value: If either seed or seed2 are set to be non-zero, the random number +// generator is seeded by the given seed. Otherwise, it is seeded by a +// random seed. +// If not specified, defaults to 0 +func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value. +// +// value: An second seed to avoid seed collision. +// If not specified, defaults to 0 +func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// Generates labels for candidate sampling with a learned unigram distribution. +// +// See explanations of candidate sampling and the data formats at +// go/candidate-sampling. +// +// For each batch, this op picks a single set of sampled candidate labels. +// +// The advantages of sampling candidates per-batch are simplicity and the +// possibility of efficient dense matrix multiplication. The disadvantage is that +// the sampled candidates must be chosen independently of the context and of the +// true labels. +// +// Arguments: +// true_classes: A batch_size * num_true matrix, in which each row contains the +// IDs of the num_true target_classes in the corresponding original label. +// num_true: Number of true labels per context. +// num_sampled: Number of candidates to produce. +// unique: If unique is true, we sample with rejection, so that all sampled +// candidates in a batch are unique. This requires some approximation to +// estimate the post-rejection sampling probabilities. +// +// Returns A vector of length num_sampled, in which each element is +// the ID of a sampled candidate.A batch_size * num_true matrix, representing +// the number of times each candidate is expected to occur in a batch +// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled +// candidate representing the number of times the candidate is expected +// to occur in a batch of sampled candidates. If unique=true, then this is a +// probability. 
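A graph-construction sketch for the `DecodeAndCropJpeg` wrapper; the file name `input.jpg` and the crop-window values are hypothetical placeholders for illustration:

```go
package main

import (
	"fmt"
	"io/ioutil"
	"log"

	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	// Hypothetical input file; any JPEG will do.
	data, err := ioutil.ReadFile("input.jpg")
	if err != nil {
		log.Fatal(err)
	}
	s := op.NewScope()
	contents := op.Const(s, string(data))
	// Crop window layout is [crop_y, crop_x, crop_height, crop_width].
	cropWindow := op.Const(s, []int32{10, 20, 100, 200})
	image := op.DecodeAndCropJpeg(s, contents, cropWindow, op.DecodeAndCropJpegChannels(3))
	if _, err := s.Finalize(); err != nil {
		log.Fatal(err)
	}
	fmt.Println("decoded image output shape:", image.Shape())
}
```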
+func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "AllCandidateSampler", + Input: []tf.Input{ + true_classes, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// Adds two `SparseTensor` objects to produce another `SparseTensor`. +// +// The input `SparseTensor` objects' indices are assumed ordered in standard +// lexicographic order. If this is not the case, before this step run +// `SparseReorder` to restore index ordering. +// +// By default, if two values sum to zero at some index, the output `SparseTensor` +// would still include that particular location in its index, storing a zero in the +// corresponding value slot. To override this, callers can specify `thresh`, +// indicating that if the sum has a magnitude strictly smaller than `thresh`, its +// corresponding value and index would then not be included. In particular, +// `thresh == 0` (default) means everything is kept and actual thresholding happens +// only for a positive value. +// +// In the following shapes, `nnz` is the count after taking `thresh` into account. +// +// Arguments: +// a_indices: 2-D. The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix. +// a_values: 1-D. The `values` of the first `SparseTensor`, size `[nnz]` Vector. +// a_shape: 1-D. The `shape` of the first `SparseTensor`, size `[ndims]` Vector. +// b_indices: 2-D. The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix. +// b_values: 1-D. The `values` of the second `SparseTensor`, size `[nnz]` Vector. +// b_shape: 1-D. The `shape` of the second `SparseTensor`, size `[ndims]` Vector. +// thresh: 0-D. The magnitude threshold that determines if an output value/index +// pair takes space. +func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "SparseAdd", + Input: []tf.Input{ + a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + +// OrderedMapPeekAttr is an optional argument to OrderedMapPeek. +type OrderedMapPeekAttr func(optionalAttr) + +// OrderedMapPeekCapacity sets the optional capacity attribute to value. +// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func OrderedMapPeekCapacity(value int64) OrderedMapPeekAttr { + return func(m optionalAttr) { + m["capacity"] = value + } +} + +// OrderedMapPeekMemoryLimit sets the optional memory_limit attribute to value. +// If not specified, defaults to 0 +// +// REQUIRES: value >= 0 +func OrderedMapPeekMemoryLimit(value int64) OrderedMapPeekAttr { + return func(m optionalAttr) { + m["memory_limit"] = value + } +} + +// OrderedMapPeekContainer sets the optional container attribute to value. 
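A sketch of the `SparseAdd` wrapper adding two 2x2 `SparseTensor`s; note that `thresh` must carry the real dtype of the values (float32 here), and both index lists are already in lexicographic order as the doc comment requires:

```go
package main

import (
	"fmt"
	"log"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	// A = {(0,0): 1, (1,1): 2}, B = {(0,0): 3, (1,0): 4}, both with dense shape [2, 2].
	aIndices := op.Const(s, [][]int64{{0, 0}, {1, 1}})
	aValues := op.Const(s, []float32{1, 2})
	aShape := op.Const(s, []int64{2, 2})
	bIndices := op.Const(s, [][]int64{{0, 0}, {1, 0}})
	bValues := op.Const(s, []float32{3, 4})
	bShape := op.Const(s, []int64{2, 2})
	thresh := op.Const(s, float32(0)) // keep every summed entry

	idx, vals, shape := op.SparseAdd(s, aIndices, aValues, aShape, bIndices, bValues, bShape, thresh)
	graph, err := s.Finalize()
	if err != nil {
		log.Fatal(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer sess.Close()
	res, err := sess.Run(nil, []tf.Output{idx, vals, shape}, nil)
	if err != nil {
		log.Fatal(err)
	}
	// [[0 0] [1 0] [1 1]] [4 4 2] [2 2]
	fmt.Println(res[0].Value(), res[1].Value(), res[2].Value())
}
```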
+// If not specified, defaults to "" +func OrderedMapPeekContainer(value string) OrderedMapPeekAttr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// OrderedMapPeekSharedName sets the optional shared_name attribute to value. +// If not specified, defaults to "" +func OrderedMapPeekSharedName(value string) OrderedMapPeekAttr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// Op peeks at the values at the specified key. If the +// +// underlying container does not contain this key +// this op will block until it does. This Op is optimized for +// performance. +func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapPeekAttr) (values []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtypes": dtypes} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "OrderedMapPeek", + Input: []tf.Input{ + key, indices, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if values, idx, err = makeOutputList(op, idx, "values"); err != nil { + scope.UpdateErr("OrderedMapPeek", err) + return + } + return values +} + // Inverse fast Fourier transform. // // Computes the inverse 1-dimensional discrete Fourier transform over the @@ -9900,6 +10337,235 @@ func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyReso return scope.AddOperation(opspec) } +// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp. +type ResourceSparseApplyRMSPropAttr func(optionalAttr) + +// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value. +// +// value: If `True`, updating of the var, ms, and mom tensors is protected +// by a lock; otherwise the behavior is undefined, but may exhibit less +// contention. +// If not specified, defaults to false +func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// Update '*var' according to the RMSProp algorithm. +// +// Note that in dense implementation of this algorithm, ms and mom will +// update even if the grad is zero, but in this sparse implementation, ms +// and mom will not update in iterations during which the grad is zero. +// +// mean_square = decay * mean_square + (1-decay) * gradient ** 2 +// Delta = learning_rate * gradient / sqrt(mean_square + epsilon) +// +// ms <- rho * ms_{t-1} + (1-rho) * grad * grad +// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +// var <- var - mom +// +// Arguments: +// var_: Should be from a Variable(). +// ms: Should be from a Variable(). +// mom: Should be from a Variable(). +// lr: Scaling factor. Must be a scalar. +// rho: Decay rate. Must be a scalar. +// +// epsilon: Ridge term. Must be a scalar. +// grad: The gradient. +// indices: A vector of indices into the first dimension of var, ms and mom. +// +// Returns the created operation. 
+func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceSparseApplyRMSProp", + Input: []tf.Input{ + var_, ms, mom, lr, rho, momentum, epsilon, grad, indices, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// Returns the truth value of (x > y) element-wise. +// +// *NOTE*: `Greater` supports broadcasting. More about broadcasting +// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) +func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Greater", + Input: []tf.Input{ + x, y, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox. +type SampleDistortedBoundingBoxAttr func(optionalAttr) + +// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value. +// +// value: If either `seed` or `seed2` are set to non-zero, the random number +// generator is seeded by the given `seed`. Otherwise, it is seeded by a random +// seed. +// If not specified, defaults to 0 +func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value. +// +// value: A second seed to avoid seed collision. +// If not specified, defaults to 0 +func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value. +// +// value: The cropped area of the image must contain at least this +// fraction of any bounding box supplied. The value of this parameter should be +// non-negative. In the case of 0, the cropped area does not need to overlap +// any of the bounding boxes supplied. +// If not specified, defaults to 0.1 +func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr { + return func(m optionalAttr) { + m["min_object_covered"] = value + } +} + +// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value. +// +// value: The cropped area of the image must have an aspect ratio = +// width / height within this range. +// If not specified, defaults to +func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { + return func(m optionalAttr) { + m["aspect_ratio_range"] = value + } +} + +// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value. +// +// value: The cropped area of the image must contain a fraction of the +// supplied image within in this range. +// If not specified, defaults to +func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { + return func(m optionalAttr) { + m["area_range"] = value + } +} + +// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value. +// +// value: Number of attempts at generating a cropped region of the image +// of the specified constraints. 
After `max_attempts` failures, return the entire +// image. +// If not specified, defaults to 100 +func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr { + return func(m optionalAttr) { + m["max_attempts"] = value + } +} + +// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value. +// +// value: Controls behavior if no bounding boxes supplied. +// If true, assume an implicit bounding box covering the whole input. If false, +// raise an error. +// If not specified, defaults to false +func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr { + return func(m optionalAttr) { + m["use_image_if_no_bounding_boxes"] = value + } +} + +// Generate a single randomly distorted bounding box for an image. +// +// Bounding box annotations are often supplied in addition to ground-truth labels +// in image recognition or object localization tasks. A common technique for +// training such a system is to randomly distort an image while preserving +// its content, i.e. *data augmentation*. This Op outputs a randomly distorted +// localization of an object, i.e. bounding box, given an `image_size`, +// `bounding_boxes` and a series of constraints. +// +// The output of this Op is a single bounding box that may be used to crop the +// original image. The output is returned as 3 tensors: `begin`, `size` and +// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the +// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize +// what the bounding box looks like. +// +// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The +// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and +// height of the underlying image. +// +// For example, +// +// ```python +// # Generate a single distorted bounding box. +// begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box( +// tf.shape(image), +// bounding_boxes=bounding_boxes) +// +// # Draw the bounding box in an image summary. +// image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0), +// bbox_for_draw) +// tf.summary.image('images_with_box', image_with_box) +// +// # Employ the bounding box to distort the image. +// distorted_image = tf.slice(image, begin, size) +// ``` +// +// Note that if no bounding box information is available, setting +// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit +// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is +// false and no bounding boxes are supplied, an error is raised. +// +// Arguments: +// image_size: 1-D, containing `[height, width, channels]`. +// bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes +// associated with the image. +// +// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to +// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to +// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box. +// Provide as input to `tf.image.draw_bounding_boxes`. 
+func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "SampleDistortedBoundingBox", + Input: []tf.Input{ + image_size, bounding_boxes, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + // LRNAttr is an optional argument to LRN. type LRNAttr func(optionalAttr) @@ -10042,159 +10708,6 @@ func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, l return scope.AddOperation(opspec) } -// 2D real-valued fast Fourier transform. -// -// Computes the 2-dimensional discrete Fourier transform of a real-valued signal -// over the inner-most 2 dimensions of `input`. -// -// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the -// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension -// of `output`: the zero-frequency term, followed by the `fft_length / 2` -// positive-frequency terms. -// -// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the -// corresponding dimension of `input`, the dimension is cropped. If it is larger, -// the dimension is padded with zeros. -// -// Arguments: -// input: A float32 tensor. -// fft_length: An int32 tensor of shape [2]. The FFT length for each dimension. -// -// Returns A complex64 tensor of the same rank as `input`. The inner-most 2 -// dimensions of `input` are replaced with their 2D Fourier transform. The -// inner-most dimension contains `fft_length / 2 + 1` unique frequency -// components. -// -// @compatibility(numpy) -// Equivalent to np.fft.rfft2 -// @end_compatibility -func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "RFFT2D", - Input: []tf.Input{ - input, fft_length, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ResizeAreaAttr is an optional argument to ResizeArea. -type ResizeAreaAttr func(optionalAttr) - -// ResizeAreaAlignCorners sets the optional align_corners attribute to value. -// -// value: If true, the centers of the 4 corner pixels of the input and output tensors are -// aligned, preserving the values at the corner pixels. Defaults to false. -// If not specified, defaults to false -func ResizeAreaAlignCorners(value bool) ResizeAreaAttr { - return func(m optionalAttr) { - m["align_corners"] = value - } -} - -// Resize `images` to `size` using area interpolation. -// -// Input images can be of different types but output images are always float. -// -// The range of pixel values for the output image might be slightly different -// from the range for the input image because of limited numerical precision. -// To guarantee an output range, for example `[0.0, 1.0]`, apply -// `tf.clip_by_value` to the output. -// -// Each output pixel is computed by first transforming the pixel's footprint into -// the input tensor and then averaging the pixels that intersect the footprint. An -// input pixel's contribution to the average is weighted by the fraction of its -// area that intersects the footprint. This is the same as OpenCV's INTER_AREA. -// -// Arguments: -// images: 4-D with shape `[batch, height, width, channels]`. 
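A graph-construction sketch showing the `begin` and `size` outputs of `SampleDistortedBoundingBox` feeding directly into `op.Slice` to perform the crop, as the doc comment describes; the image placeholder, dimensions, and box coordinates are illustrative assumptions:

```go
package main

import (
	"fmt"
	"log"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	image := op.Placeholder(s, tf.Uint8) // a decoded [height, width, channels] image
	imageSize := op.Const(s, []int32{480, 640, 3})
	// One bounding box in [y_min, x_min, y_max, x_max] form, shape [batch=1, N=1, 4].
	boxes := op.Const(s, [][][]float32{{{0.1, 0.1, 0.9, 0.9}}})
	begin, size, _ := op.SampleDistortedBoundingBox(s, imageSize, boxes,
		op.SampleDistortedBoundingBoxMinObjectCovered(0.5))
	// begin/size can be fed straight into Slice to crop the original image.
	crop := op.Slice(s, image, begin, size)
	if _, err := s.Finalize(); err != nil {
		log.Fatal(err)
	}
	fmt.Println("crop op constructed:", crop.Op.Name())
}
```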
-// size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`. The -// new size for the images. -// -// Returns 4-D with shape -// `[batch, new_height, new_width, channels]`. -func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResizeArea", - Input: []tf.Input{ - images, size, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Pads a tensor with zeros. -// -// This operation pads a `input` with zeros according to the `paddings` you -// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the -// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates -// how many zeros to add before the contents of `input` in that dimension, and -// `paddings[D, 1]` indicates how many zeros to add after the contents of `input` -// in that dimension. -// -// The padded size of each dimension D of the output is: -// -// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)` -// -// For example: -// -// ``` -// # 't' is [[1, 1], [2, 2]] -// # 'paddings' is [[1, 1], [2, 2]] -// # rank of 't' is 2 -// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0] -// [0, 0, 1, 1, 0, 0] -// [0, 0, 2, 2, 0, 0] -// [0, 0, 0, 0, 0, 0]] -// ``` -func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Pad", - Input: []tf.Input{ - input, paddings, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Checks whether a resource handle-based variable has been initialized. -// -// Arguments: -// resource: the input resource handle. -// -// Returns a scalar boolean which is true if the variable has been -// initialized. -func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "VarIsInitializedOp", - Input: []tf.Input{ - resource, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform. type StatelessRandomUniformAttr func(optionalAttr) @@ -10804,47 +11317,42 @@ func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output return op.Output(0) } -// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp. -type ResourceSparseApplyRMSPropAttr func(optionalAttr) +// ResizeAreaAttr is an optional argument to ResizeArea. +type ResizeAreaAttr func(optionalAttr) -// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value. +// ResizeAreaAlignCorners sets the optional align_corners attribute to value. // -// value: If `True`, updating of the var, ms, and mom tensors is protected -// by a lock; otherwise the behavior is undefined, but may exhibit less -// contention. +// value: If true, the centers of the 4 corner pixels of the input and output tensors are +// aligned, preserving the values at the corner pixels. Defaults to false. // If not specified, defaults to false -func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr { +func ResizeAreaAlignCorners(value bool) ResizeAreaAttr { return func(m optionalAttr) { - m["use_locking"] = value + m["align_corners"] = value } } -// Update '*var' according to the RMSProp algorithm. 
+// Resize `images` to `size` using area interpolation. // -// Note that in dense implementation of this algorithm, ms and mom will -// update even if the grad is zero, but in this sparse implementation, ms -// and mom will not update in iterations during which the grad is zero. +// Input images can be of different types but output images are always float. // -// mean_square = decay * mean_square + (1-decay) * gradient ** 2 -// Delta = learning_rate * gradient / sqrt(mean_square + epsilon) +// The range of pixel values for the output image might be slightly different +// from the range for the input image because of limited numerical precision. +// To guarantee an output range, for example `[0.0, 1.0]`, apply +// `tf.clip_by_value` to the output. // -// ms <- rho * ms_{t-1} + (1-rho) * grad * grad -// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) -// var <- var - mom +// Each output pixel is computed by first transforming the pixel's footprint into +// the input tensor and then averaging the pixels that intersect the footprint. An +// input pixel's contribution to the average is weighted by the fraction of its +// area that intersects the footprint. This is the same as OpenCV's INTER_AREA. // // Arguments: -// var_: Should be from a Variable(). -// ms: Should be from a Variable(). -// mom: Should be from a Variable(). -// lr: Scaling factor. Must be a scalar. -// rho: Decay rate. Must be a scalar. +// images: 4-D with shape `[batch, height, width, channels]`. +// size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`. The +// new size for the images. // -// epsilon: Ridge term. Must be a scalar. -// grad: The gradient. -// indices: A vector of indices into the first dimension of var, ms and mom. -// -// Returns the created operation. -func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) { +// Returns 4-D with shape +// `[batch, new_height, new_width, channels]`. +func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) { if scope.Err() != nil { return } @@ -10853,184 +11361,113 @@ func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom a(attrs) } opspec := tf.OpSpec{ - Type: "ResourceSparseApplyRMSProp", + Type: "ResizeArea", Input: []tf.Input{ - var_, ms, mom, lr, rho, momentum, epsilon, grad, indices, + images, size, }, Attrs: attrs, } - return scope.AddOperation(opspec) + op := scope.AddOperation(opspec) + return op.Output(0) } -// Returns the truth value of (x > y) element-wise. +// 2D real-valued fast Fourier transform. // -// *NOTE*: `Greater` supports broadcasting. More about broadcasting -// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) -func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { +// Computes the 2-dimensional discrete Fourier transform of a real-valued signal +// over the inner-most 2 dimensions of `input`. +// +// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the +// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension +// of `output`: the zero-frequency term, followed by the `fft_length / 2` +// positive-frequency terms. 
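A sketch of the `ResizeArea` wrapper upscaling a toy 2x2 single-channel image; per the doc comment, the output dtype is always float regardless of the input type:

```go
package main

import (
	"fmt"
	"log"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	// One 2x2 single-channel image in NHWC layout: shape [1, 2, 2, 1].
	images := op.Const(s, [][][][]float32{{{{1}, {2}}, {{3}, {4}}}})
	size := op.Const(s, []int32{4, 4})
	resized := op.ResizeArea(s, images, size, op.ResizeAreaAlignCorners(false))

	graph, err := s.Finalize()
	if err != nil {
		log.Fatal(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer sess.Close()
	res, err := sess.Run(nil, []tf.Output{resized}, nil)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(res[0].Shape()) // [1 4 4 1]
}
```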
+// +// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the +// corresponding dimension of `input`, the dimension is cropped. If it is larger, +// the dimension is padded with zeros. +// +// Arguments: +// input: A float32 tensor. +// fft_length: An int32 tensor of shape [2]. The FFT length for each dimension. +// +// Returns A complex64 tensor of the same rank as `input`. The inner-most 2 +// dimensions of `input` are replaced with their 2D Fourier transform. The +// inner-most dimension contains `fft_length / 2 + 1` unique frequency +// components. +// +// @compatibility(numpy) +// Equivalent to np.fft.rfft2 +// @end_compatibility +func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) { if scope.Err() != nil { return } opspec := tf.OpSpec{ - Type: "Greater", + Type: "RFFT2D", Input: []tf.Input{ - x, y, + input, fft_length, }, } op := scope.AddOperation(opspec) return op.Output(0) } -// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox. -type SampleDistortedBoundingBoxAttr func(optionalAttr) - -// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value. +// Pads a tensor with zeros. // -// value: If either `seed` or `seed2` are set to non-zero, the random number -// generator is seeded by the given `seed`. Otherwise, it is seeded by a random -// seed. -// If not specified, defaults to 0 -func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value. +// This operation pads a `input` with zeros according to the `paddings` you +// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the +// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates +// how many zeros to add before the contents of `input` in that dimension, and +// `paddings[D, 1]` indicates how many zeros to add after the contents of `input` +// in that dimension. // -// value: A second seed to avoid seed collision. -// If not specified, defaults to 0 -func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value. +// The padded size of each dimension D of the output is: // -// value: The cropped area of the image must contain at least this -// fraction of any bounding box supplied. The value of this parameter should be -// non-negative. In the case of 0, the cropped area does not need to overlap -// any of the bounding boxes supplied. -// If not specified, defaults to 0.1 -func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr { - return func(m optionalAttr) { - m["min_object_covered"] = value - } -} - -// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value. +// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)` // -// value: The cropped area of the image must have an aspect ratio = -// width / height within this range. -// If not specified, defaults to -func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { - return func(m optionalAttr) { - m["aspect_ratio_range"] = value - } -} - -// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value. 
+// For example: // -// value: The cropped area of the image must contain a fraction of the -// supplied image within in this range. -// If not specified, defaults to -func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { - return func(m optionalAttr) { - m["area_range"] = value - } -} - -// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value. -// -// value: Number of attempts at generating a cropped region of the image -// of the specified constraints. After `max_attempts` failures, return the entire -// image. -// If not specified, defaults to 100 -func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr { - return func(m optionalAttr) { - m["max_attempts"] = value - } -} - -// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value. -// -// value: Controls behavior if no bounding boxes supplied. -// If true, assume an implicit bounding box covering the whole input. If false, -// raise an error. -// If not specified, defaults to false -func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr { - return func(m optionalAttr) { - m["use_image_if_no_bounding_boxes"] = value - } -} - -// Generate a single randomly distorted bounding box for an image. -// -// Bounding box annotations are often supplied in addition to ground-truth labels -// in image recognition or object localization tasks. A common technique for -// training such a system is to randomly distort an image while preserving -// its content, i.e. *data augmentation*. This Op outputs a randomly distorted -// localization of an object, i.e. bounding box, given an `image_size`, -// `bounding_boxes` and a series of constraints. -// -// The output of this Op is a single bounding box that may be used to crop the -// original image. The output is returned as 3 tensors: `begin`, `size` and -// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the -// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize -// what the bounding box looks like. -// -// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The -// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and -// height of the underlying image. -// -// For example, -// -// ```python -// # Generate a single distorted bounding box. -// begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box( -// tf.shape(image), -// bounding_boxes=bounding_boxes) -// -// # Draw the bounding box in an image summary. -// image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0), -// bbox_for_draw) -// tf.summary.image('images_with_box', image_with_box) -// -// # Employ the bounding box to distort the image. -// distorted_image = tf.slice(image, begin, size) // ``` -// -// Note that if no bounding box information is available, setting -// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit -// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is -// false and no bounding boxes are supplied, an error is raised. -// -// Arguments: -// image_size: 1-D, containing `[height, width, channels]`. -// bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes -// associated with the image. -// -// Returns 1-D, containing `[offset_height, offset_width, 0]`. 
Provide as input to -// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to -// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box. -// Provide as input to `tf.image.draw_bounding_boxes`. -func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) { +// # 't' is [[1, 1], [2, 2]] +// # 'paddings' is [[1, 1], [2, 2]] +// # rank of 't' is 2 +// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0] +// [0, 0, 1, 1, 0, 0] +// [0, 0, 2, 2, 0, 0] +// [0, 0, 0, 0, 0, 0]] +// ``` +func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } opspec := tf.OpSpec{ - Type: "SampleDistortedBoundingBox", + Type: "Pad", Input: []tf.Input{ - image_size, bounding_boxes, + input, paddings, }, - Attrs: attrs, } op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) + return op.Output(0) +} + +// Checks whether a resource handle-based variable has been initialized. +// +// Arguments: +// resource: the input resource handle. +// +// Returns a scalar boolean which is true if the variable has been +// initialized. +func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "VarIsInitializedOp", + Input: []tf.Input{ + resource, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) } // Converts each string in the input Tensor to its hash mod by a number of buckets. @@ -13698,6 +14135,44 @@ func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filenam return scope.AddOperation(opspec) } +// Real-valued fast Fourier transform. +// +// Computes the 1-dimensional discrete Fourier transform of a real-valued signal +// over the inner-most dimension of `input`. +// +// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the +// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term, +// followed by the `fft_length / 2` positive-frequency terms. +// +// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the +// corresponding dimension of `input`, the dimension is cropped. If it is larger, +// the dimension is padded with zeros. +// +// Arguments: +// input: A float32 tensor. +// fft_length: An int32 tensor of shape [1]. The FFT length. +// +// Returns A complex64 tensor of the same rank as `input`. The inner-most +// dimension of `input` is replaced with the `fft_length / 2 + 1` unique +// frequency components of its 1D Fourier transform. +// +// @compatibility(numpy) +// Equivalent to np.fft.rfft +// @end_compatibility +func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "RFFT", + Input: []tf.Input{ + input, fft_length, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // QuantizedReluAttr is an optional argument to QuantizedRelu. type QuantizedReluAttr func(optionalAttr) @@ -15418,6 +15893,216 @@ func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output return op.Output(0) } +// SkipgramAttr is an optional argument to Skipgram. +type SkipgramAttr func(optionalAttr) + +// SkipgramWindowSize sets the optional window_size attribute to value. 
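A sketch of the `Pad` wrapper reproducing the example values from its doc comment:

```go
package main

import (
	"fmt"
	"log"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	t := op.Const(s, [][]int32{{1, 1}, {2, 2}})
	// One row of zeros above and below, two columns of zeros left and right.
	paddings := op.Const(s, [][]int32{{1, 1}, {2, 2}})
	padded := op.Pad(s, t, paddings)

	graph, err := s.Finalize()
	if err != nil {
		log.Fatal(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer sess.Close()
	res, err := sess.Run(nil, []tf.Output{padded}, nil)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(res[0].Value()) // the 4x6 matrix from the doc comment
}
```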
+// +// value: The number of words to predict to the left and right of the target. +// If not specified, defaults to 5 +func SkipgramWindowSize(value int64) SkipgramAttr { + return func(m optionalAttr) { + m["window_size"] = value + } +} + +// SkipgramMinCount sets the optional min_count attribute to value. +// +// value: The minimum number of word occurrences for it to be included in the +// vocabulary. +// If not specified, defaults to 5 +func SkipgramMinCount(value int64) SkipgramAttr { + return func(m optionalAttr) { + m["min_count"] = value + } +} + +// SkipgramSubsample sets the optional subsample attribute to value. +// +// value: Threshold for word occurrence. Words that appear with higher +// frequency will be randomly down-sampled. Set to 0 to disable. +// If not specified, defaults to 0.001 +func SkipgramSubsample(value float32) SkipgramAttr { + return func(m optionalAttr) { + m["subsample"] = value + } +} + +// Parses a text file and creates a batch of examples. +// +// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result +// +// Arguments: +// filename: The corpus's text file name. +// batch_size: The size of produced batch. +// +// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids. +func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "Skipgram", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6) +} + +// StringToNumberAttr is an optional argument to StringToNumber. +type StringToNumberAttr func(optionalAttr) + +// StringToNumberOutType sets the optional out_type attribute to value. +// +// value: The numeric type to interpret each string in `string_tensor` as. +// If not specified, defaults to DT_FLOAT +func StringToNumberOutType(value tf.DataType) StringToNumberAttr { + return func(m optionalAttr) { + m["out_type"] = value + } +} + +// Converts each string in the input Tensor to the specified numeric type. +// +// (Note that int32 overflow results in an error while float overflow +// results in a rounded value.) +// +// Returns A Tensor of the same shape as the input `string_tensor`. +func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "StringToNumber", + Input: []tf.Input{ + string_tensor, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + +// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2. +type ResourceApplyFtrlV2Attr func(optionalAttr) + +// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value. 
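A sketch of the `StringToNumber` wrapper; `out_type` selects the parse target, and per the doc comment int32 overflow is an error while float overflow rounds:

```go
package main

import (
	"fmt"
	"log"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	strs := op.Const(s, []string{"3.14", "-42", "1e3"})
	nums := op.StringToNumber(s, strs, op.StringToNumberOutType(tf.Float))

	graph, err := s.Finalize()
	if err != nil {
		log.Fatal(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer sess.Close()
	res, err := sess.Run(nil, []tf.Output{nums}, nil)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(res[0].Value()) // [3.14 -42 1000]
}
```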
+// +// value: If `True`, updating of the var and accum tensors will be protected +// by a lock; otherwise the behavior is undefined, but may exhibit less +// contention. +// If not specified, defaults to false +func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr { + return func(m optionalAttr) { + m["use_locking"] = value + } +} + +// Update '*var' according to the Ftrl-proximal scheme. +// +// grad_with_shrinkage = grad + 2 * l2_shrinkage * var +// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage +// linear += grad_with_shrinkage + +// (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +// accum = accum_new +// +// Arguments: +// var_: Should be from a Variable(). +// accum: Should be from a Variable(). +// linear: Should be from a Variable(). +// grad: The gradient. +// lr: Scaling factor. Must be a scalar. +// l1: L1 regulariation. Must be a scalar. +// l2: L2 shrinkage regulariation. Must be a scalar. +// +// lr_power: Scaling factor. Must be a scalar. +// +// Returns the created operation. +func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ResourceApplyFtrlV2", + Input: []tf.Input{ + var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + +// TruncatedNormalAttr is an optional argument to TruncatedNormal. +type TruncatedNormalAttr func(optionalAttr) + +// TruncatedNormalSeed sets the optional seed attribute to value. +// +// value: If either `seed` or `seed2` are set to be non-zero, the random number +// generator is seeded by the given seed. Otherwise, it is seeded by a +// random seed. +// If not specified, defaults to 0 +func TruncatedNormalSeed(value int64) TruncatedNormalAttr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// TruncatedNormalSeed2 sets the optional seed2 attribute to value. +// +// value: A second seed to avoid seed collision. +// If not specified, defaults to 0 +func TruncatedNormalSeed2(value int64) TruncatedNormalAttr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// Outputs random values from a truncated normal distribution. +// +// The generated values follow a normal distribution with mean 0 and standard +// deviation 1, except that values whose magnitude is more than 2 standard +// deviations from the mean are dropped and re-picked. +// +// Arguments: +// shape: The shape of the output tensor. +// dtype: The type of the output. +// +// Returns A tensor of the specified shape filled with random truncated normal +// values. +func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"dtype": dtype} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "TruncatedNormal", + Input: []tf.Input{ + shape, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2. 
type MutableDenseHashTableV2Attr func(optionalAttr) @@ -16053,6 +16738,62 @@ func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) { return op.Output(0) } +// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth. +type HistogramFixedWidthAttr func(optionalAttr) + +// HistogramFixedWidthDtype sets the optional dtype attribute to value. +// If not specified, defaults to DT_INT32 +func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr { + return func(m optionalAttr) { + m["dtype"] = value + } +} + +// Return histogram of values. +// +// Given the tensor `values`, this operation returns a rank 1 histogram counting +// the number of entries in `values` that fall into every bin. The bins are +// equal width and determined by the arguments `value_range` and `nbins`. +// +// ```python +// # Bins will be: (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) +// nbins = 5 +// value_range = [0.0, 5.0] +// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] +// +// with tf.get_default_session() as sess: +// hist = tf.histogram_fixed_width(new_values, value_range, nbins=5) +// variables.global_variables_initializer().run() +// sess.run(hist) => [2, 1, 1, 0, 2] +// ``` +// +// Arguments: +// values: Numeric `Tensor`. +// value_range: Shape [2] `Tensor` of same `dtype` as `values`. +// values <= value_range[0] will be mapped to hist[0], +// values >= value_range[1] will be mapped to hist[-1]. +// nbins: Scalar `int32 Tensor`. Number of histogram bins. +// +// Returns A 1-D `Tensor` holding histogram of values. +func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "HistogramFixedWidth", + Input: []tf.Input{ + values, value_range, nbins, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // Returns the truth value of (x >= y) element-wise. // // *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting @@ -16561,305 +17302,6 @@ func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) { return scope.AddOperation(opspec) } -// Adds two `SparseTensor` objects to produce another `SparseTensor`. -// -// The input `SparseTensor` objects' indices are assumed ordered in standard -// lexicographic order. If this is not the case, before this step run -// `SparseReorder` to restore index ordering. -// -// By default, if two values sum to zero at some index, the output `SparseTensor` -// would still include that particular location in its index, storing a zero in the -// corresponding value slot. To override this, callers can specify `thresh`, -// indicating that if the sum has a magnitude strictly smaller than `thresh`, its -// corresponding value and index would then not be included. In particular, -// `thresh == 0` (default) means everything is kept and actual thresholding happens -// only for a positive value. -// -// In the following shapes, `nnz` is the count after taking `thresh` into account. -// -// Arguments: -// a_indices: 2-D. The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix. -// a_values: 1-D. The `values` of the first `SparseTensor`, size `[nnz]` Vector. -// a_shape: 1-D. The `shape` of the first `SparseTensor`, size `[ndims]` Vector. -// b_indices: 2-D. The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix. 
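A sketch of the regenerated `HistogramFixedWidth` wrapper mirroring the example in its doc comment; with these inputs the expected histogram is `[2 1 1 0 2]`:

```go
package main

import (
	"fmt"
	"log"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	// Bins: (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) for value_range [0, 5].
	values := op.Const(s, []float32{-1.0, 0.0, 1.5, 2.0, 5.0, 15})
	valueRange := op.Const(s, []float32{0.0, 5.0})
	nbins := op.Const(s, int32(5))
	hist := op.HistogramFixedWidth(s, values, valueRange, nbins)

	graph, err := s.Finalize()
	if err != nil {
		log.Fatal(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer sess.Close()
	res, err := sess.Run(nil, []tf.Output{hist}, nil)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(res[0].Value()) // [2 1 1 0 2]
}
```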
-// b_values: 1-D. The `values` of the second `SparseTensor`, size `[nnz]` Vector. -// b_shape: 1-D. The `shape` of the second `SparseTensor`, size `[ndims]` Vector. -// thresh: 0-D. The magnitude threshold that determines if an output value/index -// pair takes space. -func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "SparseAdd", - Input: []tf.Input{ - a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - -// OrderedMapPeekAttr is an optional argument to OrderedMapPeek. -type OrderedMapPeekAttr func(optionalAttr) - -// OrderedMapPeekCapacity sets the optional capacity attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func OrderedMapPeekCapacity(value int64) OrderedMapPeekAttr { - return func(m optionalAttr) { - m["capacity"] = value - } -} - -// OrderedMapPeekMemoryLimit sets the optional memory_limit attribute to value. -// If not specified, defaults to 0 -// -// REQUIRES: value >= 0 -func OrderedMapPeekMemoryLimit(value int64) OrderedMapPeekAttr { - return func(m optionalAttr) { - m["memory_limit"] = value - } -} - -// OrderedMapPeekContainer sets the optional container attribute to value. -// If not specified, defaults to "" -func OrderedMapPeekContainer(value string) OrderedMapPeekAttr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// OrderedMapPeekSharedName sets the optional shared_name attribute to value. -// If not specified, defaults to "" -func OrderedMapPeekSharedName(value string) OrderedMapPeekAttr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// Op peeks at the values at the specified key. If the -// -// underlying container does not contain this key -// this op will block until it does. This Op is optimized for -// performance. -func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapPeekAttr) (values []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtypes": dtypes} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "OrderedMapPeek", - Input: []tf.Input{ - key, indices, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if values, idx, err = makeOutputList(op, idx, "values"); err != nil { - scope.UpdateErr("OrderedMapPeek", err) - return - } - return values -} - -// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg. -type DecodeAndCropJpegAttr func(optionalAttr) - -// DecodeAndCropJpegChannels sets the optional channels attribute to value. -// -// value: Number of color channels for the decoded image. -// If not specified, defaults to 0 -func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr { - return func(m optionalAttr) { - m["channels"] = value - } -} - -// DecodeAndCropJpegRatio sets the optional ratio attribute to value. -// -// value: Downscaling ratio. 
-// If not specified, defaults to 1 -func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr { - return func(m optionalAttr) { - m["ratio"] = value - } -} - -// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value. -// -// value: If true use a slower but nicer upscaling of the -// chroma planes (yuv420/422 only). -// If not specified, defaults to true -func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr { - return func(m optionalAttr) { - m["fancy_upscaling"] = value - } -} - -// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value. -// -// value: If true try to recover an image from truncated input. -// If not specified, defaults to false -func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr { - return func(m optionalAttr) { - m["try_recover_truncated"] = value - } -} - -// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value. -// -// value: The minimum required fraction of lines before a truncated -// input is accepted. -// If not specified, defaults to 1 -func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr { - return func(m optionalAttr) { - m["acceptable_fraction"] = value - } -} - -// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value. -// -// value: string specifying a hint about the algorithm used for -// decompression. Defaults to "" which maps to a system-specific -// default. Currently valid values are ["INTEGER_FAST", -// "INTEGER_ACCURATE"]. The hint may be ignored (e.g., the internal -// jpeg library changes to a version that does not have that specific -// option.) -// If not specified, defaults to "" -func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr { - return func(m optionalAttr) { - m["dct_method"] = value - } -} - -// Decode and Crop a JPEG-encoded image to a uint8 tensor. -// -// The attr `channels` indicates the desired number of color channels for the -// decoded image. -// -// Accepted values are: -// -// * 0: Use the number of channels in the JPEG-encoded image. -// * 1: output a grayscale image. -// * 3: output an RGB image. -// -// If needed, the JPEG-encoded image is transformed to match the requested number -// of color channels. -// -// The attr `ratio` allows downscaling the image by an integer factor during -// decoding. Allowed values are: 1, 2, 4, and 8. This is much faster than -// downscaling the image later. -// -// -// It is equivalent to a combination of decode and crop, but much faster by only -// decoding partial jpeg image. -// -// Arguments: -// contents: 0-D. The JPEG-encoded image. -// crop_window: 1-D. The crop window: [crop_y, crop_x, crop_height, crop_width]. -// -// Returns 3-D with shape `[height, width, channels]`.. -func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "DecodeAndCropJpeg", - Input: []tf.Input{ - contents, crop_window, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler. -type AllCandidateSamplerAttr func(optionalAttr) - -// AllCandidateSamplerSeed sets the optional seed attribute to value. 
-// -// value: If either seed or seed2 are set to be non-zero, the random number -// generator is seeded by the given seed. Otherwise, it is seeded by a -// random seed. -// If not specified, defaults to 0 -func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value. -// -// value: An second seed to avoid seed collision. -// If not specified, defaults to 0 -func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// Generates labels for candidate sampling with a learned unigram distribution. -// -// See explanations of candidate sampling and the data formats at -// go/candidate-sampling. -// -// For each batch, this op picks a single set of sampled candidate labels. -// -// The advantages of sampling candidates per-batch are simplicity and the -// possibility of efficient dense matrix multiplication. The disadvantage is that -// the sampled candidates must be chosen independently of the context and of the -// true labels. -// -// Arguments: -// true_classes: A batch_size * num_true matrix, in which each row contains the -// IDs of the num_true target_classes in the corresponding original label. -// num_true: Number of true labels per context. -// num_sampled: Number of candidates to produce. -// unique: If unique is true, we sample with rejection, so that all sampled -// candidates in a batch are unique. This requires some approximation to -// estimate the post-rejection sampling probabilities. -// -// Returns A vector of length num_sampled, in which each element is -// the ID of a sampled candidate.A batch_size * num_true matrix, representing -// the number of times each candidate is expected to occur in a batch -// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled -// candidate representing the number of times the candidate is expected -// to occur in a batch of sampled candidates. If unique=true, then this is a -// probability. -func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "AllCandidateSampler", - Input: []tf.Input{ - true_classes, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2) -} - // Saves the input tensors to disk. // // The size of `tensor_names` must match the number of tensors in `data`. `data[i]` @@ -18997,216 +19439,6 @@ func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf return op.Output(0) } -// SkipgramAttr is an optional argument to Skipgram. -type SkipgramAttr func(optionalAttr) - -// SkipgramWindowSize sets the optional window_size attribute to value. -// -// value: The number of words to predict to the left and right of the target. -// If not specified, defaults to 5 -func SkipgramWindowSize(value int64) SkipgramAttr { - return func(m optionalAttr) { - m["window_size"] = value - } -} - -// SkipgramMinCount sets the optional min_count attribute to value. 
-// -// value: The minimum number of word occurrences for it to be included in the -// vocabulary. -// If not specified, defaults to 5 -func SkipgramMinCount(value int64) SkipgramAttr { - return func(m optionalAttr) { - m["min_count"] = value - } -} - -// SkipgramSubsample sets the optional subsample attribute to value. -// -// value: Threshold for word occurrence. Words that appear with higher -// frequency will be randomly down-sampled. Set to 0 to disable. -// If not specified, defaults to 0.001 -func SkipgramSubsample(value float32) SkipgramAttr { - return func(m optionalAttr) { - m["subsample"] = value - } -} - -// Parses a text file and creates a batch of examples. -// -// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result -// -// Arguments: -// filename: The corpus's text file name. -// batch_size: The size of produced batch. -// -// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids. -func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "Skipgram", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6) -} - -// StringToNumberAttr is an optional argument to StringToNumber. -type StringToNumberAttr func(optionalAttr) - -// StringToNumberOutType sets the optional out_type attribute to value. -// -// value: The numeric type to interpret each string in `string_tensor` as. -// If not specified, defaults to DT_FLOAT -func StringToNumberOutType(value tf.DataType) StringToNumberAttr { - return func(m optionalAttr) { - m["out_type"] = value - } -} - -// Converts each string in the input Tensor to the specified numeric type. -// -// (Note that int32 overflow results in an error while float overflow -// results in a rounded value.) -// -// Returns A Tensor of the same shape as the input `string_tensor`. -func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "StringToNumber", - Input: []tf.Input{ - string_tensor, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2. -type ResourceApplyFtrlV2Attr func(optionalAttr) - -// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value. -// -// value: If `True`, updating of the var and accum tensors will be protected -// by a lock; otherwise the behavior is undefined, but may exhibit less -// contention. -// If not specified, defaults to false -func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr { - return func(m optionalAttr) { - m["use_locking"] = value - } -} - -// Update '*var' according to the Ftrl-proximal scheme. 
-// -// grad_with_shrinkage = grad + 2 * l2_shrinkage * var -// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage -// linear += grad_with_shrinkage + -// (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var -// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 -// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 -// accum = accum_new -// -// Arguments: -// var_: Should be from a Variable(). -// accum: Should be from a Variable(). -// linear: Should be from a Variable(). -// grad: The gradient. -// lr: Scaling factor. Must be a scalar. -// l1: L1 regulariation. Must be a scalar. -// l2: L2 shrinkage regulariation. Must be a scalar. -// -// lr_power: Scaling factor. Must be a scalar. -// -// Returns the created operation. -func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "ResourceApplyFtrlV2", - Input: []tf.Input{ - var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power, - }, - Attrs: attrs, - } - return scope.AddOperation(opspec) -} - -// TruncatedNormalAttr is an optional argument to TruncatedNormal. -type TruncatedNormalAttr func(optionalAttr) - -// TruncatedNormalSeed sets the optional seed attribute to value. -// -// value: If either `seed` or `seed2` are set to be non-zero, the random number -// generator is seeded by the given seed. Otherwise, it is seeded by a -// random seed. -// If not specified, defaults to 0 -func TruncatedNormalSeed(value int64) TruncatedNormalAttr { - return func(m optionalAttr) { - m["seed"] = value - } -} - -// TruncatedNormalSeed2 sets the optional seed2 attribute to value. -// -// value: A second seed to avoid seed collision. -// If not specified, defaults to 0 -func TruncatedNormalSeed2(value int64) TruncatedNormalAttr { - return func(m optionalAttr) { - m["seed2"] = value - } -} - -// Outputs random values from a truncated normal distribution. -// -// The generated values follow a normal distribution with mean 0 and standard -// deviation 1, except that values whose magnitude is more than 2 standard -// deviations from the mean are dropped and re-picked. -// -// Arguments: -// shape: The shape of the output tensor. -// dtype: The type of the output. -// -// Returns A tensor of the specified shape filled with random truncated normal -// values. -func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"dtype": dtype} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "TruncatedNormal", - Input: []tf.Input{ - shape, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // RandomShuffleAttr is an optional argument to RandomShuffle. type RandomShuffleAttr func(optionalAttr) @@ -19325,113 +19557,6 @@ func OrderedMapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...Or return op.Output(0) } -// DecodeRawAttr is an optional argument to DecodeRaw. -type DecodeRawAttr func(optionalAttr) - -// DecodeRawLittleEndian sets the optional little_endian attribute to value. -// -// value: Whether the input `bytes` are in little-endian order. 
-// Ignored for `out_type` values that are stored in a single byte like -// `uint8`. -// If not specified, defaults to true -func DecodeRawLittleEndian(value bool) DecodeRawAttr { - return func(m optionalAttr) { - m["little_endian"] = value - } -} - -// Reinterpret the bytes of a string as a vector of numbers. -// -// Arguments: -// bytes: All the elements must have the same length. -// -// -// Returns A Tensor with one more dimension than the input `bytes`. The -// added dimension will have size equal to the length of the elements -// of `bytes` divided by the number of bytes to represent `out_type`. -func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"out_type": out_type} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "DecodeRaw", - Input: []tf.Input{ - bytes, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - -// Copy a tensor setting everything outside a central band in each innermost matrix -// -// to zero. -// -// The `band` part is computed as follows: -// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a -// tensor with the same shape where -// -// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`. -// -// The indicator function -// -// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) && -// (num_upper < 0 || (n-m) <= num_upper)`. -// -// For example: -// -// ``` -// # if 'input' is [[ 0, 1, 2, 3] -// [-1, 0, 1, 2] -// [-2, -1, 0, 1] -// [-3, -2, -1, 0]], -// -// tf.matrix_band_part(input, 1, -1) ==> [[ 0, 1, 2, 3] -// [-1, 0, 1, 2] -// [ 0, -1, 0, 1] -// [ 0, 0, -1, 0]], -// -// tf.matrix_band_part(input, 2, 1) ==> [[ 0, 1, 0, 0] -// [-1, 0, 1, 0] -// [-2, -1, 0, 1] -// [ 0, -2, -1, 0]] -// ``` -// -// Useful special cases: -// -// ``` -// tf.matrix_band_part(input, 0, -1) ==> Upper triangular part. -// tf.matrix_band_part(input, -1, 0) ==> Lower triangular part. -// tf.matrix_band_part(input, 0, 0) ==> Diagonal. -// ``` -// -// Arguments: -// input: Rank `k` tensor. -// num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire -// lower triangle. -// num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep -// entire upper triangle. -// -// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor. -func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "MatrixBandPart", - Input: []tf.Input{ - input, num_lower, num_upper, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // Counts the number of occurrences of each value in an integer array. // // Outputs a vector with length `size` and the same dtype as `weights`. If @@ -21159,7 +21284,7 @@ func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { // generated sequentially as '*tag*/image/0', '*tag*/image/1', etc. // // The `bad_color` argument is the color to use in the generated images for -// non-finite input values. It is a `uint8` 1-D tensor of length `channels`. +// non-finite input values. It is a `unit8` 1-D tensor of length `channels`. // Each element must be in the range `[0, 255]` (It represents the value of a // pixel in the output image). Non-finite values in the input tensor are // replaced by this tensor in the output image. 
The default value is the color @@ -30569,128 +30694,3 @@ func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values op := scope.AddOperation(opspec) return op.Output(0) } - -// Gather slices from `params` into a Tensor with shape specified by `indices`. -// -// `indices` is an K-dimensional integer tensor, best thought of as a -// (K-1)-dimensional tensor of indices into `params`, where each element defines a -// slice of `params`: -// -// output[i_0, ..., i_{K-2}] = params[indices[i0, ..., i_{K-2}]] -// -// Whereas in @{tf.gather} `indices` defines slices into the first -// dimension of `params`, in `tf.gather_nd`, `indices` defines slices into the -// first `N` dimensions of `params`, where `N = indices.shape[-1]`. -// -// The last dimension of `indices` can be at most the rank of -// `params`: -// -// indices.shape[-1] <= params.rank -// -// The last dimension of `indices` corresponds to elements -// (if `indices.shape[-1] == params.rank`) or slices -// (if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]` -// of `params`. The output tensor has shape -// -// indices.shape[:-1] + params.shape[indices.shape[-1]:] -// -// Note that on CPU, if an out of bound index is found, an error is returned. -// On GPU, if an out of bound index is found, a 0 is stored in the -// corresponding output value. -// -// Some examples below. -// -// Simple indexing into a matrix: -// -// ```python -// indices = [[0, 0], [1, 1]] -// params = [['a', 'b'], ['c', 'd']] -// output = ['a', 'd'] -// ``` -// -// Slice indexing into a matrix: -// -// ```python -// indices = [[1], [0]] -// params = [['a', 'b'], ['c', 'd']] -// output = [['c', 'd'], ['a', 'b']] -// ``` -// -// Indexing into a 3-tensor: -// -// ```python -// indices = [[1]] -// params = [[['a0', 'b0'], ['c0', 'd0']], -// [['a1', 'b1'], ['c1', 'd1']]] -// output = [[['a1', 'b1'], ['c1', 'd1']]] -// -// -// indices = [[0, 1], [1, 0]] -// params = [[['a0', 'b0'], ['c0', 'd0']], -// [['a1', 'b1'], ['c1', 'd1']]] -// output = [['c0', 'd0'], ['a1', 'b1']] -// -// -// indices = [[0, 0, 1], [1, 0, 1]] -// params = [[['a0', 'b0'], ['c0', 'd0']], -// [['a1', 'b1'], ['c1', 'd1']]] -// output = ['b0', 'b1'] -// ``` -// -// Batched indexing into a matrix: -// -// ```python -// indices = [[[0, 0]], [[0, 1]]] -// params = [['a', 'b'], ['c', 'd']] -// output = [['a'], ['b']] -// ``` -// -// Batched slice indexing into a matrix: -// -// ```python -// indices = [[[1]], [[0]]] -// params = [['a', 'b'], ['c', 'd']] -// output = [[['c', 'd']], [['a', 'b']]] -// ``` -// -// Batched indexing into a 3-tensor: -// -// ```python -// indices = [[[1]], [[0]]] -// params = [[['a0', 'b0'], ['c0', 'd0']], -// [['a1', 'b1'], ['c1', 'd1']]] -// output = [[[['a1', 'b1'], ['c1', 'd1']]], -// [[['a0', 'b0'], ['c0', 'd0']]]] -// -// indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]] -// params = [[['a0', 'b0'], ['c0', 'd0']], -// [['a1', 'b1'], ['c1', 'd1']]] -// output = [[['c0', 'd0'], ['a1', 'b1']], -// [['a0', 'b0'], ['c1', 'd1']]] -// -// -// indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]] -// params = [[['a0', 'b0'], ['c0', 'd0']], -// [['a1', 'b1'], ['c1', 'd1']]] -// output = [['b0', 'b1'], ['d0', 'c1']] -// ``` -// -// Arguments: -// params: The tensor from which to gather values. -// indices: Index tensor. -// -// Returns Values from `params` gathered from indices given by `indices`, with -// shape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`. 
-func GatherNd(scope *Scope, params tf.Output, indices tf.Output) (output tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "GatherNd", - Input: []tf.Input{ - params, indices, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} From 8fa27b1903ceedb25da5649aa17160866dda734d Mon Sep 17 00:00:00 2001 From: Billy Lamberta Date: Mon, 23 Apr 2018 22:08:52 -0700 Subject: [PATCH 0647/1734] docs: Clean up install_linux with pip --- tensorflow/docs_src/install/install_linux.md | 440 +++++++++---------- 1 file changed, 200 insertions(+), 240 deletions(-) diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md index b7b0fc7d3db..9b431e49eeb 100644 --- a/tensorflow/docs_src/install/install_linux.md +++ b/tensorflow/docs_src/install/install_linux.md @@ -103,37 +103,196 @@ the specified versions. If upgrading is not possible, then you may still run TensorFlow with GPU support, if you @{$install_sources$install TensorFlow from Sources}. -## Determine how to install TensorFlow +## How to install TensorFlow -You must pick the mechanism by which you install TensorFlow. The -supported choices are as follows: +There are a few options to install TensorFlow on your machine: - * [Virtualenv](#InstallingVirtualenv) - * ["native" pip](#InstallingNativePip) - * [Docker](#InstallingDocker) - * [Anaconda](#InstallingAnaconda) - * installing from sources, which is documented in - [a separate guide](https://www.tensorflow.org/install/install_sources). +* [Use pip in a virtual environment](#InstallingVirtualenv) *(recommended)* +* [Use pip in your system environment](#InstallingNativePip) +* [Configure a Docker container](#InstallingDocker) +* [Use pip in Anaconda](#InstallingAnaconda) +* [Install TensorFlow from source](/install/install_sources) -**We recommend the Virtualenv installation.** -[Virtualenv](https://virtualenv.pypa.io/en/stable/) -is a virtual Python environment isolated from other Python development, -incapable of interfering with or being affected by other Python programs -on the same machine. During the Virtualenv installation process, -you will install not only TensorFlow but also all the packages that -TensorFlow requires. (This is actually pretty easy.) -To start working with TensorFlow, you simply need to "activate" the -virtual environment. All in all, Virtualenv provides a safe and -reliable mechanism for installing and running TensorFlow. + +### Use `pip` in a virtual environment -Native pip installs TensorFlow directly on your system without going -through any container system. **We recommend the native pip install for -system administrators aiming to make TensorFlow available to everyone on a -multi-user system.** Since a native pip installation is not walled-off in -a separate container, the pip installation might interfere with other -Python-based installations on your system. However, if you understand pip -and your Python environment, a native pip installation often entails only -a single command. +This is the *recommended* install method. The +[Virtualenv](https://virtualenv.pypa.io/en/stable/) tool creates virtual Python +environments that are isolated from other Python development on the same machine. +In this scenario, you install TensorFlow and its dependencies within a virtual +environment that is available when *activated*. Virtualenv provides a reliable +way to install and run TensorFlow while avoiding conflicts with the rest of the +system. + +1\. 
On Ubuntu, install the `pip` and `virtualenv` packages: + +
+  sudo apt-get install python-pip python-dev python-virtualenv   # for Python 2.7
+  sudo apt-get install python3-pip python3-dev python-virtualenv # for Python 3.n
+
+ +2\. Create a directory for the virtual environment and choose a Python +interpreter: + +
+  mkdir ~/tensorflow  # somewhere to work out of
+  cd ~/tensorflow
+  # Choose one of the following Python environments for the ./venv directory:
+  virtualenv --system-site-packages venv            # Use the default Python (Python 2.7)
+  virtualenv --system-site-packages -p python3 venv # Use Python 3.n
+
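+
+If `virtualenv` is not available, Python 3.3+ also ships a built-in `venv`
+module that creates an equivalent environment (unlike the commands above, it
+does not enable `--system-site-packages` by default):
+
+  python3 -m venv venv  # standard-library alternative to virtualenv (Python 3.3+)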
+ +3\. Activate the Virtualenv environment using one of these shell commands: + +
+  source ~/tensorflow/venv/bin/activate      # bash, sh, ksh, or zsh
+  source ~/tensorflow/venv/bin/activate.csh  # csh or tcsh
+  . ~/tensorflow/venv/bin/activate.fish      # fish
+
+ +When the Virtualenv is activated, the shell prompt displays as `(venv) $`. + +4\. Upgrade `pip` in your virtual environment: + +See the [pip installation guide](https://pip.pypa.io/en/stable/installing/) for +instructions, or use `easy_install`: + +
+(venv)$ easy_install -U pip
+
+ +5\. Within an active Virtualenv environment, use one of the following `pip` +commands to install the TensorFlow package: + +
+(venv)$ pip install --upgrade tensorflow      # for Python 2.7
+(venv)$ pip3 install --upgrade tensorflow     # for Python 3.n
+(venv)$ pip install --upgrade tensorflow-gpu  # for Python 2.7 and GPU
+(venv)$ pip3 install --upgrade tensorflow-gpu # for Python 3.n and GPU
+
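+
+If you installed the `tensorflow-gpu` package, you can confirm that the GPU is
+visible by listing the local devices (the same `device_lib` helper that
+TensorFlow's own benchmarks use):
+
+(venv)$ python -c "from tensorflow.python.client import device_lib; print(device_lib.list_local_devices())"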
+ +Success! TensorFlow is now installed. + +Use `pip list` to show the packages installed in the virtual environment. +[Validate the install](#ValidateYourInstallation) and test the version: + +
+(venv)$ python -c "import tensorflow as tf; print(tf.__version__)"
+
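+
+Optionally, run a short program that builds and executes a tiny graph (this
+mirrors the longer validation program behind the link above):
+
+(venv)$ python -c "import tensorflow as tf; print(tf.Session().run(tf.constant('Hello, TensorFlow!')))"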
+ +Use the `deactivate` command to stop the Python virtual environment. + +#### Problems + +If the above steps failed, try installing the TensorFlow binary using the remote +URL of the `pip` package: + +
+(venv)$ pip install --upgrade remote-pkg-URL   # Python 2.7
+(venv)$ pip3 install --upgrade remote-pkg-URL  # Python 3.n
+
+ +The remote-pkg-URL depends on the operating system, Python version, +and GPU support. See [here](#the_url_of_the_tensorflow_python_package) for the +URL naming scheme and location. + +See [Common Installation Problems](#common_installation_problems) if you +encounter problems. + +#### Uninstall TensorFlow + +To uninstall TensorFlow, remove the Virtualenv directory you created in step 2: + +
+  deactivate  # stop the virtualenv
+  rm -r ~/tensorflow/venv
+
+
+
+### Use `pip` in your system environment
+
+Use `pip` to install the TensorFlow package directly on your system without
+using a container or virtual environment for isolation. This method is
+recommended for system administrators who want a TensorFlow installation that is
+available to everyone on a multi-user system.
+
+Since a system install is not isolated, it could interfere with other
+Python-based installations. But if you understand `pip` and your Python
+environment, a system `pip` install is straightforward.
+
+See the
+[REQUIRED_PACKAGES section of setup.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/pip_package/setup.py)
+for a list of TensorFlow packages that `pip` installs or upgrades.
+
+
+#### Install Python and `pip`
+
+On Ubuntu, Python is automatically installed and `pip` is *usually* installed.
+Confirm the `python` and `pip` versions:
+
+  python -V
+  pip -V  # or: pip3 -V
+
+ +We *strongly recommend* `pip` or `pip3` version 8.1 or higher. If using a release +before version 8.1, upgrade `pip`: + +
+  sudo apt-get install python-pip python-dev   # for Python 2.7
+  sudo apt-get install python3-pip python3-dev # for Python 3.n
+
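+
+If the packaged `pip` is still older than 8.1, `pip` can also upgrade itself:
+
+  sudo pip install --upgrade pip   # or: sudo pip3 install --upgrade pip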
+ + +#### Install TensorFlow + +Install one of the available TensorFlow packages: + +
+  # Select one:
+  sudo pip install tensorflow      # Python 2.7 CPU (no GPU support)
+  sudo pip3 install tensorflow     # Python 3.n CPU (no GPU support)
+  sudo pip install tensorflow-gpu  # Python 2.7 GPU support
+  sudo pip3 install tensorflow-gpu # Python 3.n GPU support
+
+ +Success! TensorFlow is now installed. + +Use `pip list` to show the packages installed on the system. +[Validate the install](#ValidateYourInstallation) and test the version: + +
+  python -c "import tensorflow as tf; print(tf.__version__)"
+
+ +#### Problems + +If the above steps failed, try installing the TensorFlow binary using the remote +URL of the `pip` package: + +
+  sudo pip install --upgrade remote-pkg-URL   # Python 2.7
+  sudo pip3 install --upgrade remote-pkg-URL  # Python 3.n
+
+
+The remote-pkg-URL depends on the operating system, Python version,
+and GPU support. See [here](#the_url_of_the_tensorflow_python_package) for the
+URL naming scheme and location.
+
+See [Common Installation Problems](#common_installation_problems) if you
+encounter problems.
+
+#### Uninstall TensorFlow
+
+To uninstall TensorFlow on your system, use one of the following commands:
+
+  sudo pip uninstall tensorflow   # for Python 2.7
+  sudo pip3 uninstall tensorflow  # for Python 3.n
+
+ + +### Configure a Docker container Docker completely isolates the TensorFlow installation from pre-existing packages on your machine. The Docker container contains @@ -142,210 +301,6 @@ large (hundreds of MBs). You might choose the Docker installation if you are incorporating TensorFlow into a larger application architecture that already uses Docker. -In Anaconda, you may use conda to create a virtual environment. -However, within Anaconda, we recommend installing TensorFlow with the -`pip install` command, not with the `conda install` command. - -**NOTE:** The conda package is community supported, not officially supported. -That is, the TensorFlow team neither tests nor maintains the conda package. -Use that package at your own risk. - - - -## Installing with Virtualenv - -Take the following steps to install TensorFlow with Virtualenv: - - 1. Install pip and Virtualenv by issuing one of the following commands: - -
$ sudo apt-get install python-pip python-dev python-virtualenv # for Python 2.7
-    $ sudo apt-get install python3-pip python3-dev python-virtualenv # for Python 3.n
- - 2. Create a Virtualenv environment by issuing one of the following commands: - -
$ virtualenv --system-site-packages targetDirectory # for Python 2.7
-    $ virtualenv --system-site-packages -p python3 targetDirectory # for Python 3.n
- - where targetDirectory specifies the top of the - Virtualenv tree. Our instructions assume that - targetDirectory is `~/tensorflow`, but you may - choose any directory. - - 3. Activate the Virtualenv environment by issuing one of the following - commands: - -
$ source ~/tensorflow/bin/activate # bash, sh, ksh, or zsh
-    $ source ~/tensorflow/bin/activate.csh  # csh or tcsh
-    $ . ~/tensorflow/bin/activate.fish  # fish
- - The preceding source command should change your prompt - to the following: - -
(tensorflow)$ 
- - 4. Ensure pip ≥8.1 is installed: - -
(tensorflow)$ easy_install -U pip
- - 5. Issue one of the following commands to install TensorFlow in the active - Virtualenv environment: - -
(tensorflow)$ pip install --upgrade tensorflow      # for Python 2.7
-    (tensorflow)$ pip3 install --upgrade tensorflow     # for Python 3.n
-    (tensorflow)$ pip install --upgrade tensorflow-gpu  # for Python 2.7 and GPU
-    (tensorflow)$ pip3 install --upgrade tensorflow-gpu # for Python 3.n and GPU
- - If the above command succeeds, skip Step 6. If the preceding - command fails, perform Step 6. - - 6. (Optional) If Step 5 failed (typically because you invoked a pip version - lower than 8.1), install TensorFlow in the active Virtualenv environment - by issuing a command of the following format: - -
(tensorflow)$ pip install --upgrade tfBinaryURL   # Python 2.7
-    (tensorflow)$ pip3 install --upgrade tfBinaryURL  # Python 3.n 
- - where tfBinaryURL identifies the URL of the - TensorFlow Python package. The appropriate value of - tfBinaryURLdepends on the operating system, - Python version, and GPU support. Find the appropriate value for - tfBinaryURL for your system - [here](#the_url_of_the_tensorflow_python_package). For example, if you - are installing TensorFlow for Linux, Python 3.4, and CPU-only support, - issue the following command to install TensorFlow in the active - Virtualenv environment: - -
(tensorflow)$ pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp34-cp34m-linux_x86_64.whl
- -If you encounter installation problems, see -[Common Installation Problems](#common_installation_problems). - - -### Next Steps - -After installing TensorFlow, -[validate the installation](#ValidateYourInstallation). - -Note that you must activate the Virtualenv environment each time you -use TensorFlow. If the Virtualenv environment is not currently active, -invoke one of the following commands: - -
$ source ~/tensorflow/bin/activate      # bash, sh, ksh, or zsh
-$ source ~/tensorflow/bin/activate.csh  # csh or tcsh
- -When the Virtualenv environment is active, you may run -TensorFlow programs from this shell. Your prompt will become -the following to indicate that your tensorflow environment is active: - -
(tensorflow)$ 
- -When you are done using TensorFlow, you may deactivate the -environment by invoking the `deactivate` function as follows: - -
(tensorflow)$ deactivate 
- -The prompt will revert back to your default prompt (as defined by the -`PS1` environment variable). - - -### Uninstalling TensorFlow - -To uninstall TensorFlow, simply remove the tree you created. -For example: - -
$ rm -r targetDirectory 
- - - -## Installing with native pip - -You may install TensorFlow through pip, choosing between a simple -installation procedure or a more complex one. - -**Note:** The -[REQUIRED_PACKAGES section of setup.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/pip_package/setup.py) -lists the TensorFlow packages that pip will install or upgrade. - - -### Prerequisite: Python and Pip - -Python is automatically installed on Ubuntu. Take a moment to confirm -(by issuing a `python -V` command) that one of the following Python -versions is already installed on your system: - - * Python 2.7 - * Python 3.4+ - -The pip or pip3 package manager is *usually* installed on Ubuntu. Take a -moment to confirm (by issuing a `pip -V` or `pip3 -V` command) -that pip or pip3 is installed. We strongly recommend version 8.1 or higher -of pip or pip3. If Version 8.1 or later is not installed, issue the -following command, which will either install or upgrade to the latest -pip version: - -
$ sudo apt-get install python-pip python-dev   # for Python 2.7
-$ sudo apt-get install python3-pip python3-dev # for Python 3.n
-
- - -### Install TensorFlow - -Assuming the prerequisite software is installed on your Linux host, -take the following steps: - - 1. Install TensorFlow by invoking **one** of the following commands: - -
$ pip install tensorflow      # Python 2.7; CPU support (no GPU support)
-    $ pip3 install tensorflow     # Python 3.n; CPU support (no GPU support)
-    $ pip install tensorflow-gpu  # Python 2.7;  GPU support
-    $ pip3 install tensorflow-gpu # Python 3.n; GPU support 
- - If the preceding command runs to completion, you should now - [validate your installation](#ValidateYourInstallation). - - 2. (Optional.) If Step 1 failed, install the latest version of TensorFlow - by issuing a command of the following format: - -
$ sudo pip  install --upgrade tfBinaryURL   # Python 2.7
-    $ sudo pip3 install --upgrade tfBinaryURL   # Python 3.n 
- - where tfBinaryURL identifies the URL of the - TensorFlow Python package. The appropriate value of - tfBinaryURL depends on the operating system, - Python version, and GPU support. Find the appropriate value for - tfBinaryURL - [here](#the_url_of_the_tensorflow_python_package). For example, to - install TensorFlow for Linux, Python 3.4, and CPU-only support, issue - the following command: - -
-     $ sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp34-cp34m-linux_x86_64.whl
-     
- - If this step fails, see - [Common Installation Problems](#common_installation_problems). - - -### Next Steps - -After installing TensorFlow, [validate your installation](#ValidateYourInstallation). - - -### Uninstalling TensorFlow - -To uninstall TensorFlow, issue one of following commands: - -
-$ sudo pip uninstall tensorflow  # for Python 2.7
-$ sudo pip3 uninstall tensorflow # for Python 3.n
-
- - - -## Installing with Docker - Take the following steps to install TensorFlow through Docker: 1. Install Docker on your machine as described in the @@ -364,7 +319,7 @@ Take the following steps to install TensorFlow through Docker: The remainder of this section explains how to launch a Docker container. -### CPU-only +#### CPU-only To launch a Docker container with CPU-only support (that is, without GPU support), enter a command of the following format: @@ -414,7 +369,7 @@ $ docker run -it -p 8888:8888 tensorflow/tensorflow Docker will download the TensorFlow binary image the first time you launch it. -### GPU support +#### GPU support Prior to installing TensorFlow with GPU support, ensure that your system meets all [NVIDIA software requirements](#NVIDIARequirements). To launch a Docker container @@ -470,14 +425,22 @@ For more details see the [TensorFlow docker readme](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/docker). -### Next Steps +#### Next Steps You should now [validate your installation](#ValidateYourInstallation). -## Installing with Anaconda +### Use `pip` in Anaconda + +Anaconda provides the `conda` utility to create a virtual environment. However, +within Anaconda, we recommend installing TensorFlow using the `pip install` +command and *not* with the `conda install` command. + +Caution: `conda` is a community supported package this is not officially +maintained by the TensorFlow team. Use this package at your own risk since it is +not tested on new TensorFlow releases. Take the following steps to install TensorFlow in an Anaconda environment: @@ -563,10 +526,7 @@ installation problems](#common_installation_problems). If you are new to machine learning, we recommend the following: * [Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course) -* @{$get_started/get_started_for_beginners$Getting Started for ML Beginners} - -If you are experienced with machine learning but new to TensorFlow, see -@{$get_started/premade_estimators$Getting Started with TensorFlow}. +* @{$get_started/eager} ## Common installation problems @@ -581,7 +541,7 @@ ask a new question about it on Stack Overflow and specify the `tensorflow` tag.
| Version              | CPU/GPU | Python Version | Compiler           | Build Tools  | cuDNN | CUDA |
|----------------------|---------|----------------|--------------------|--------------|-------|------|
| tensorflow-1.8.0     | CPU     | 3.5-3.6        | MSVC 2015 update 3 | Cmake v3.6.3 | N/A   | N/A  |
| tensorflow_gpu-1.8.0 | GPU     | 3.5-3.6        | MSVC 2015 update 3 | Cmake v3.6.3 | 7     | 9    |
| tensorflow-1.7.0     | CPU     | 3.5-3.6        | MSVC 2015 update 3 | Cmake v3.6.3 | N/A   | N/A  |
| tensorflow_gpu-1.7.0 | GPU     | 3.5-3.6        | MSVC 2015 update 3 | Cmake v3.6.3 | 7     | 9    |
| tensorflow-1.6.0     | CPU     | 3.5-3.6        | MSVC 2015 update 3 | Cmake v3.6.3 | N/A   | N/A  |
- + From 9c5c558cba9069dfedfde9431ed13227b3893bbf Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 23 Apr 2018 22:36:35 -0700 Subject: [PATCH 0648/1734] Make ClientLibraryTestBase::CreateScalarRelu return XlaComputation. PiperOrigin-RevId: 194036707 --- tensorflow/compiler/xla/tests/client_library_test_base.cc | 4 ++-- tensorflow/compiler/xla/tests/client_library_test_base.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc index 31c9e216441..c09a6d71c98 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.cc +++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc @@ -621,8 +621,8 @@ ClientLibraryTestBase::ComputeValueAndReference( return std::make_pair(std::move(reference), std::move(result)); } -Computation ClientLibraryTestBase::CreateScalarRelu() { - ComputationBuilder builder(client_, "relu"); +XlaComputation ClientLibraryTestBase::CreateScalarRelu() { + XlaBuilder builder("relu"); auto shape = ShapeUtil::MakeShape(use_bfloat16_ ? BF16 : F32, {}); auto z_value = builder.Parameter(0, shape, "z_value"); auto zero = use_bfloat16_ diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h index 85ebe29ae97..c303a4562eb 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.h +++ b/tensorflow/compiler/xla/tests/client_library_test_base.h @@ -255,7 +255,7 @@ class ClientLibraryTestBase : public ::testing::Test { ErrorSpec error); // Create scalar operations for use in reductions. - Computation CreateScalarRelu(); + XlaComputation CreateScalarRelu(); Computation CreateScalarMax(); Computation CreateScalarReluSensitivity(); From d75f2bf9041c7d50c932e48a175c9d5ab0bd0075 Mon Sep 17 00:00:00 2001 From: Igor Ganichev Date: Mon, 23 Apr 2018 22:36:39 -0700 Subject: [PATCH 0649/1734] Internal change PiperOrigin-RevId: 194036710 --- .../eager/python/examples/resnet50/BUILD | 11 ++++++ .../python/examples/resnet50/resnet50_test.py | 34 ++++++++++--------- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/tensorflow/contrib/eager/python/examples/resnet50/BUILD b/tensorflow/contrib/eager/python/examples/resnet50/BUILD index 536cad998d9..0c0e28dd95c 100644 --- a/tensorflow/contrib/eager/python/examples/resnet50/BUILD +++ b/tensorflow/contrib/eager/python/examples/resnet50/BUILD @@ -14,6 +14,17 @@ py_library( ], ) +py_library( + name = "resnet50_test_lib", + srcs = ["resnet50_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":resnet50", + "//tensorflow:tensorflow_py", + "//tensorflow/contrib/eager/python:tfe", + ], +) + cuda_py_test( name = "resnet50_test", size = "large", diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py index d6923293a37..09a0cd88d87 100644 --- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py +++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py @@ -36,8 +36,8 @@ def device_and_data_format(): 'channels_last') -def random_batch(batch_size): - _, data_format = device_and_data_format() +def random_batch(batch_size, device_and_format=None): + _, data_format = device_and_format or device_and_data_format() shape = (3, 224, 224) if data_format == 'channels_first' else (224, 224, 3) shape = (batch_size,) + shape @@ -184,22 +184,23 @@ class ResNet50Benchmarks(tf.test.Benchmark): def 
_report(self, label, start, num_iters, device, batch_size, data_format): avg_time = (time.time() - start) / num_iters - dev = 'cpu' if 'cpu' in device else 'gpu' + dev = tf.DeviceSpec.from_string(device).device_type.lower() name = '%s_%s_batch_%d_%s' % (label, dev, batch_size, data_format) extras = {'examples_per_sec': batch_size / avg_time} self.report_benchmark( iters=num_iters, wall_time=avg_time, name=name, extras=extras) - def _force_gpu_sync(self): - # If this function is called in the context of a GPU device + def _force_device_sync(self): + # If this function is called in the context of a non-CPU device # (e.g., inside a 'with tf.device("/gpu:0")' block) - # then this will force a copy from CPU->GPU->CPU, which forces - # a sync. This is a roundabout way, yes. + # then this will force a copy from CPU->NON_CPU_DEVICE->CPU, + # which forces a sync. This is a roundabout way, yes. tf.constant(1.).cpu() - def _benchmark_eager_apply(self, label, defun=False, execution_mode=None): + def _benchmark_eager_apply(self, label, defun=False, execution_mode=None, + device_and_format=None): with tfe.execution_mode(execution_mode): - device, data_format = device_and_data_format() + device, data_format = device_and_format or device_and_data_format() model = resnet50.ResNet50(data_format) if defun: model.call = tfe.defun(model.call) @@ -207,7 +208,7 @@ class ResNet50Benchmarks(tf.test.Benchmark): num_burn = 5 num_iters = 30 with tf.device(device): - images, _ = random_batch(batch_size) + images, _ = random_batch(batch_size, device_and_format) for _ in xrange(num_burn): model(images, training=False).cpu() if execution_mode: @@ -220,7 +221,7 @@ class ResNet50Benchmarks(tf.test.Benchmark): tfe.async_wait() self._report(label, start, num_iters, device, batch_size, data_format) - def benchmark_eager_apply(self): + def benchmark_eager_apply_sync(self): self._benchmark_eager_apply('eager_apply', defun=False) def benchmark_eager_apply_async(self): @@ -234,11 +235,12 @@ class ResNet50Benchmarks(tf.test.Benchmark): label, make_iterator, defun=False, - execution_mode=None): + execution_mode=None, + device_and_format=None): with tfe.execution_mode(execution_mode): - device, data_format = device_and_data_format() + device, data_format = device_and_format or device_and_data_format() for batch_size in self._train_batch_sizes(): - (images, labels) = random_batch(batch_size) + (images, labels) = random_batch(batch_size, device_and_format) num_burn = 3 num_iters = 10 model = resnet50.ResNet50(data_format) @@ -253,7 +255,7 @@ class ResNet50Benchmarks(tf.test.Benchmark): train_one_step(model, images, labels, optimizer) if execution_mode: tfe.async_wait() - self._force_gpu_sync() + self._force_device_sync() gc.collect() start = time.time() @@ -262,7 +264,7 @@ class ResNet50Benchmarks(tf.test.Benchmark): train_one_step(model, images, labels, optimizer) if execution_mode: tfe.async_wait() - self._force_gpu_sync() + self._force_device_sync() self._report(label, start, num_iters, device, batch_size, data_format) def benchmark_eager_train(self): From 969be44f38d566b46b2d8a15958fd10db2b108fb Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 23 Apr 2018 23:18:11 -0700 Subject: [PATCH 0650/1734] Update ops-related pbtxt files. 
PiperOrigin-RevId: 194039856 --- .../core/ops/compat/ops_history.v1.pbtxt | 194 ++++++++++++++++++ tensorflow/core/ops/ops.pbtxt | 194 ++++++++++++++++++ 2 files changed, 388 insertions(+) diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt index 247f9edf5b2..05dee30ca07 100644 --- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt @@ -1534,6 +1534,85 @@ op { } } } +op { + name: "ApplyAdaMax" + input_arg { + name: "var" + type_attr: "T" + is_ref: true + } + input_arg { + name: "m" + type_attr: "T" + is_ref: true + } + input_arg { + name: "v" + type_attr: "T" + is_ref: true + } + input_arg { + name: "beta1_power" + type_attr: "T" + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "beta1" + type_attr: "T" + } + input_arg { + name: "beta2" + type_attr: "T" + } + input_arg { + name: "epsilon" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + output_arg { + name: "out" + type_attr: "T" + is_ref: true + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT32 + type: DT_UINT8 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_INT64 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_BFLOAT16 + type: DT_UINT16 + type: DT_COMPLEX128 + type: DT_HALF + type: DT_UINT32 + type: DT_UINT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} op { name: "ApplyAdadelta" input_arg { @@ -11234,6 +11313,38 @@ op { } } } +op { + name: "BroadcastTo" + input_arg { + name: "input" + type_attr: "T" + } + input_arg { + name: "shape" + type_attr: "Tidx" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + } + attr { + name: "Tidx" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } +} op { name: "Bucketize" input_arg { @@ -42885,6 +42996,78 @@ op { } } } +op { + name: "ResourceApplyAdaMax" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "m" + type: DT_RESOURCE + } + input_arg { + name: "v" + type: DT_RESOURCE + } + input_arg { + name: "beta1_power" + type_attr: "T" + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "beta1" + type_attr: "T" + } + input_arg { + name: "beta2" + type_attr: "T" + } + input_arg { + name: "epsilon" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT32 + type: DT_UINT8 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_INT64 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_BFLOAT16 + type: DT_UINT16 + type: DT_COMPLEX128 + type: DT_HALF + type: DT_UINT32 + type: DT_UINT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } + is_stateful: true +} op { name: "ResourceApplyAdadelta" input_arg { @@ -66434,6 +66617,17 @@ op { } } } +op { + name: "StringStrip" + input_arg { + name: "input" + type: DT_STRING + } + output_arg { + name: "output" + type: DT_STRING + } +} op { name: "StringToHashBucket" input_arg { diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index d1773daebe4..2edd15c446b 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -684,6 +684,85 @@ op { } } } +op { + name: 
"ApplyAdaMax" + input_arg { + name: "var" + type_attr: "T" + is_ref: true + } + input_arg { + name: "m" + type_attr: "T" + is_ref: true + } + input_arg { + name: "v" + type_attr: "T" + is_ref: true + } + input_arg { + name: "beta1_power" + type_attr: "T" + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "beta1" + type_attr: "T" + } + input_arg { + name: "beta2" + type_attr: "T" + } + input_arg { + name: "epsilon" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + output_arg { + name: "out" + type_attr: "T" + is_ref: true + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT32 + type: DT_UINT8 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_INT64 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_BFLOAT16 + type: DT_UINT16 + type: DT_COMPLEX128 + type: DT_HALF + type: DT_UINT32 + type: DT_UINT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} op { name: "ApplyAdadelta" input_arg { @@ -4388,6 +4467,38 @@ op { } } } +op { + name: "BroadcastTo" + input_arg { + name: "input" + type_attr: "T" + } + input_arg { + name: "shape" + type_attr: "Tidx" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + } + attr { + name: "Tidx" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } +} op { name: "Bucketize" input_arg { @@ -21487,6 +21598,78 @@ op { } } } +op { + name: "ResourceApplyAdaMax" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "m" + type: DT_RESOURCE + } + input_arg { + name: "v" + type: DT_RESOURCE + } + input_arg { + name: "beta1_power" + type_attr: "T" + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "beta1" + type_attr: "T" + } + input_arg { + name: "beta2" + type_attr: "T" + } + input_arg { + name: "epsilon" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT32 + type: DT_UINT8 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_INT64 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_BFLOAT16 + type: DT_UINT16 + type: DT_COMPLEX128 + type: DT_HALF + type: DT_UINT32 + type: DT_UINT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } + is_stateful: true +} op { name: "ResourceApplyAdadelta" input_arg { @@ -30483,6 +30666,17 @@ op { } } } +op { + name: "StringStrip" + input_arg { + name: "input" + type: DT_STRING + } + output_arg { + name: "output" + type: DT_STRING + } +} op { name: "StringToHashBucket" input_arg { From aab0ef354b628ff4d88ab7f90b2d5bdcc440b6de Mon Sep 17 00:00:00 2001 From: Igor Ganichev Date: Tue, 24 Apr 2018 00:15:19 -0700 Subject: [PATCH 0651/1734] Internal Change PiperOrigin-RevId: 194043623 --- .../eager/python/examples/resnet50/resnet50_test.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py index 09a0cd88d87..8517a3bf7b6 100644 --- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py +++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py @@ -169,7 +169,7 @@ class ResNet50Benchmarks(tf.test.Benchmark): def 
_train_batch_sizes(self): """Choose batch sizes based on GPU capability.""" for device in device_lib.list_local_devices(): - if 'GPU:0' in device.name: + if tf.DeviceSpec.from_string(device.name).device_type == 'GPU': # Avoid OOM errors with larger batch sizes, which seem to cause errors # later on even if caught. # @@ -180,6 +180,11 @@ class ResNet50Benchmarks(tf.test.Benchmark): return (16,) if 'P100' in device.physical_device_desc: return (16, 32, 64) + + if tf.DeviceSpec.from_string(device.name).device_type == 'TPU': + # TODO(iga): Training fails with batch size of 16, probably because of + # no layout optimizations with op-by-op mode. Investigate more. + return (8,) return (16, 32) def _report(self, label, start, num_iters, device, batch_size, data_format): @@ -267,7 +272,7 @@ class ResNet50Benchmarks(tf.test.Benchmark): self._force_device_sync() self._report(label, start, num_iters, device, batch_size, data_format) - def benchmark_eager_train(self): + def benchmark_eager_train_sync(self): self._benchmark_eager_train('eager_train', MockIterator, defun=False) def benchmark_eager_train_async(self): From 8f20757e9bff4e2f2cdaf1a2e655eb7e0c17b68c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 24 Apr 2018 02:00:06 -0700 Subject: [PATCH 0652/1734] Moving the Var class to framework so that it can be part of framework_headers_lib and accessible from contrib. PiperOrigin-RevId: 194054227 --- tensorflow/core/framework/resource_var.h | 58 ++++++++++++++++++++++++ tensorflow/core/kernels/variable_ops.h | 34 +------------- 2 files changed, 59 insertions(+), 33 deletions(-) create mode 100644 tensorflow/core/framework/resource_var.h diff --git a/tensorflow/core/framework/resource_var.h b/tensorflow/core/framework/resource_var.h new file mode 100644 index 00000000000..872b8f8b304 --- /dev/null +++ b/tensorflow/core/framework/resource_var.h @@ -0,0 +1,58 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_RESOURCE_VAR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_RESOURCE_VAR_H_ + +#include "tensorflow/core/framework/resource_mgr.h" + +namespace tensorflow { + +// Resource stored by variables in the resource manager +// (new, resource-style version). +class Var : public ResourceBase { + public: + explicit Var(DataType dtype) : tensor_(dtype) {} + // Not copyable or movable. + Var(const Var&) = delete; + Var& operator=(const Var&) = delete; + + // TODO(ebrevdo): Use LockSet instead of exposing mu. + mutex* mu() { return &mu_; } + Tensor* tensor() { return &tensor_; } + + string DebugString() override { + return strings::StrCat(DataTypeString(tensor_.dtype()), "/", + tensor_.shape().DebugString()); + } + + // Only used in the resource variable path. In resource variables, + // tensor.IsInitialized() can be true (i.e. 
have memory allocated to it) while
+  // there is not a good value there due to a race condition, and it's possible
+  // to stumble upon this during variable.initialized_value(). So it's best to
+  // just store directly whether the variable is initialized.
+  bool is_initialized = false;  // GUARDED_BY(mu_) but annotalysis doesn't like
+                                // it.
+
+ private:
+  mutex mu_;
+  Tensor tensor_;
+
+  ~Var() override {}
+};
+
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_FRAMEWORK_RESOURCE_VAR_H_
diff --git a/tensorflow/core/kernels/variable_ops.h b/tensorflow/core/kernels/variable_ops.h
index 8b406e5311c..f27dab4ddda 100644
--- a/tensorflow/core/kernels/variable_ops.h
+++ b/tensorflow/core/kernels/variable_ops.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/resource_var.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -27,39 +28,6 @@ namespace tensorflow {

-// Resource stored by variables in the resource manager
-// (new, resource-style version).
-class Var : public ResourceBase {
- public:
-  explicit Var(DataType dtype) : tensor_(dtype) {}
-  // Not copyable or movable.
-  Var(const Var&) = delete;
-  Var& operator=(const Var&) = delete;
-
-  // TODO(ebrevdo): Use LockSet instead of exposing mu.
-  mutex* mu() { return &mu_; }
-  Tensor* tensor() { return &tensor_; }
-
-  string DebugString() override {
-    return strings::StrCat(DataTypeString(tensor_.dtype()), "/",
-                           tensor_.shape().DebugString());
-  }
-
-  // Only used in the resource variable path. In resource variables,
-  // tensor.IsInitialized() can be true (i.e. have memory allocated to it) while
-  // there is not a good value there due to a race condition, and it's possible
-  // to stumble upon this during variable.initialized_value(). So it's best to
-  // just store directly whether the variable is initialized.
-  bool is_initialized = false;  // GUARDED_BY(mu_) but annotalysis doesn't like
-                                // it.
-
- private:
-  mutex mu_;
-  Tensor tensor_;
-
-  ~Var() override {}
-};
-
 class VariableOp : public OpKernel {
  public:
   explicit VariableOp(OpKernelConstruction* context);

From 7ea8e98a9ecf5ad8c23a8df220126f6addbdf2af Mon Sep 17 00:00:00 2001
From: Sagi
Date: Tue, 24 Apr 2018 17:36:49 +0800
Subject: [PATCH 0653/1734] Update README.md

Awesome and detailed doc! But I wouldn't call it an "awkward" package path :)
---
 tensorflow/go/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/go/README.md b/tensorflow/go/README.md
index b1bd87eb0c3..e251356ec8e 100644
--- a/tensorflow/go/README.md
+++ b/tensorflow/go/README.md
@@ -5,7 +5,7 @@ Construct and execute TensorFlow graphs in Go.
 [![GoDoc](https://godoc.org/github.com/tensorflow/tensorflow/tensorflow/go?status.svg)](https://godoc.org/github.com/tensorflow/tensorflow/tensorflow/go)

 > *WARNING*: The API defined in this package is not stable and can change
-> without notice. The same goes for the awkward package path
+> without notice. The same goes for the package path:
 > (`github.com/tensorflow/tensorflow/tensorflow/go`).
## Quickstart From e74b98ba6348d869fee50b95b7795885fdedecee Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Tue, 24 Apr 2018 04:33:16 -0700 Subject: [PATCH 0654/1734] Automated g4 rollback of changelist 193718607 PiperOrigin-RevId: 194068437 --- .../core/distributed_runtime/master_session.cc | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc index e3022f38a24..83afc5b1a46 100644 --- a/tensorflow/core/distributed_runtime/master_session.cc +++ b/tensorflow/core/distributed_runtime/master_session.cc @@ -89,6 +89,10 @@ class MasterSession::ReffedClientGraph : public core::RefCounted { ~ReffedClientGraph() override { if (should_deregister_) { DeregisterPartitions(); + } else { + for (Part& part : partitions_) { + worker_cache_->ReleaseWorker(part.name, part.worker); + } } } @@ -1174,14 +1178,8 @@ Status MasterSession::Create(GraphDef* graph_def, TF_RETURN_IF_ERROR(GraphExecutionState::MakeForBaseGraph( graph_def, execution_options, &execution_state_)); } - // TODO(b/36574172): Remove these conditions when ClusterSpec - // propagation is supported in all servers. - if (options.cluster_def != nullptr || - session_opts_.config.isolate_session_state()) { - should_delete_worker_sessions_ = true; - return CreateWorkerSessions(options); - } - return Status::OK(); + should_delete_worker_sessions_ = true; + return CreateWorkerSessions(options); } Status MasterSession::CreateWorkerSessions( From 9f38ab74161a0e8dd0b35b47f23ddeda7b286af3 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 24 Apr 2018 04:35:39 -0700 Subject: [PATCH 0655/1734] Add variants of DoBlasGemmWithAlgorithm with alpha being on device. This is in preparation of allowing XLA to fuse (A dot b) * alpha where alpha can be on device instead of just a constant. PiperOrigin-RevId: 194068597 --- tensorflow/stream_executor/blas.h | 115 ++++++++------- tensorflow/stream_executor/cuda/cuda_blas.cc | 81 +++++++---- tensorflow/stream_executor/cuda/cuda_blas.h | 14 +- .../stream_executor/host_or_device_scalar.h | 56 ++++++++ tensorflow/stream_executor/stream.cc | 136 ++++++++++-------- tensorflow/stream_executor/stream.h | 68 ++++----- 6 files changed, 294 insertions(+), 176 deletions(-) create mode 100644 tensorflow/stream_executor/host_or_device_scalar.h diff --git a/tensorflow/stream_executor/blas.h b/tensorflow/stream_executor/blas.h index 6e62b85728a..be0b0bf5fb2 100644 --- a/tensorflow/stream_executor/blas.h +++ b/tensorflow/stream_executor/blas.h @@ -41,9 +41,10 @@ limitations under the License. #define TENSORFLOW_STREAM_EXECUTOR_BLAS_H_ #include -#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/host_or_device_scalar.h" #include "tensorflow/stream_executor/lib/array_slice.h" +#include "tensorflow/stream_executor/platform/port.h" namespace Eigen { struct half; @@ -1032,43 +1033,49 @@ class BlasSupport { // creating a new Stream for each attempt. 
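// A HostOrDeviceScalar<T> wraps either a host constant or a pointer to a
// single-element device buffer; for any one call, alpha and beta must use the
// same storage kind. A minimal sketch of building the two flavors
// (`scale_mem` is a hypothetical one-element DeviceMemory<float>):
//
//   HostOrDeviceScalar<float> host_alpha(1.0f);         // host constant
//   HostOrDeviceScalar<float> device_alpha(scale_mem);  // device scalar
//   CHECK(device_alpha.is_pointer() && !host_alpha.is_pointer());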
virtual bool DoBlasGemmWithAlgorithm( Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, int alpha, const DeviceMemory &a, int lda, - const DeviceMemory &b, int ldb, int beta, DeviceMemory *c, + uint64 n, uint64 k, const HostOrDeviceScalar &alpha, + const DeviceMemory &a, int lda, const DeviceMemory &b, + int ldb, const HostOrDeviceScalar &beta, DeviceMemory *c, int ldc, ComputationType computation_type, AlgorithmType algorithm, ProfileResult *output_profile_result) = 0; virtual bool DoBlasGemmWithAlgorithm( Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, const Eigen::half &alpha, + uint64 n, uint64 k, const HostOrDeviceScalar &alpha, const DeviceMemory &a, int lda, - const DeviceMemory &b, int ldb, const Eigen::half &beta, - DeviceMemory *c, int ldc, ComputationType computation_type, - AlgorithmType algorithm, ProfileResult *output_profile_result) = 0; - virtual bool DoBlasGemmWithAlgorithm( - Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, float alpha, const DeviceMemory &a, int lda, - const DeviceMemory &b, int ldb, float beta, DeviceMemory *c, + const DeviceMemory &b, int ldb, + const HostOrDeviceScalar &beta, DeviceMemory *c, int ldc, ComputationType computation_type, AlgorithmType algorithm, ProfileResult *output_profile_result) = 0; virtual bool DoBlasGemmWithAlgorithm( Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, double alpha, const DeviceMemory &a, int lda, - const DeviceMemory &b, int ldb, double beta, - DeviceMemory *c, int ldc, ComputationType computation_type, - AlgorithmType algorithm, ProfileResult *output_profile_result) = 0; + uint64 n, uint64 k, const HostOrDeviceScalar &alpha, + const DeviceMemory &a, int lda, const DeviceMemory &b, + int ldb, const HostOrDeviceScalar &beta, DeviceMemory *c, + int ldc, ComputationType computation_type, AlgorithmType algorithm, + ProfileResult *output_profile_result) = 0; virtual bool DoBlasGemmWithAlgorithm( Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, std::complex alpha, + uint64 n, uint64 k, const HostOrDeviceScalar &alpha, + const DeviceMemory &a, int lda, const DeviceMemory &b, + int ldb, const HostOrDeviceScalar &beta, DeviceMemory *c, + int ldc, ComputationType computation_type, AlgorithmType algorithm, + ProfileResult *output_profile_result) = 0; + virtual bool DoBlasGemmWithAlgorithm( + Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, const HostOrDeviceScalar> &alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &b, int ldb, - std::complex beta, DeviceMemory> *c, int ldc, + const HostOrDeviceScalar> &beta, + DeviceMemory> *c, int ldc, ComputationType computation_type, AlgorithmType algorithm, ProfileResult *output_profile_result) = 0; virtual bool DoBlasGemmWithAlgorithm( Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, std::complex alpha, + uint64 n, uint64 k, const HostOrDeviceScalar> &alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &b, int ldb, - std::complex beta, DeviceMemory> *c, int ldc, + const HostOrDeviceScalar> &beta, + DeviceMemory> *c, int ldc, ComputationType computation_type, AlgorithmType algorithm, ProfileResult *output_profile_result) = 0; @@ -1886,50 +1893,58 @@ class BlasSupport { override; \ bool DoBlasGemmWithAlgorithm( \ Stream *stream, blas::Transpose 
transa, blas::Transpose transb, \ - uint64 m, uint64 n, uint64 k, int alpha, const DeviceMemory &a, \ - int lda, const DeviceMemory &b, int ldb, int beta, \ - DeviceMemory *c, int ldc, blas::ComputationType computation_type, \ + uint64 m, uint64 n, uint64 k, const HostOrDeviceScalar &alpha, \ + const DeviceMemory &a, int lda, const DeviceMemory &b, \ + int ldb, const HostOrDeviceScalar &beta, DeviceMemory *c, \ + int ldc, blas::ComputationType computation_type, \ blas::AlgorithmType algorithm, \ blas::ProfileResult *output_profile_result) override; \ bool DoBlasGemmWithAlgorithm( \ Stream *stream, blas::Transpose transa, blas::Transpose transb, \ - uint64 m, uint64 n, uint64 k, const Eigen::half &alpha, \ + uint64 m, uint64 n, uint64 k, \ + const HostOrDeviceScalar &alpha, \ const DeviceMemory &a, int lda, \ - const DeviceMemory &b, int ldb, const Eigen::half &beta, \ + const DeviceMemory &b, int ldb, \ + const HostOrDeviceScalar &beta, \ DeviceMemory *c, int ldc, \ blas::ComputationType computation_type, blas::AlgorithmType algorithm, \ blas::ProfileResult *output_profile_result) override; \ bool DoBlasGemmWithAlgorithm( \ Stream *stream, blas::Transpose transa, blas::Transpose transb, \ - uint64 m, uint64 n, uint64 k, float alpha, const DeviceMemory &a, \ - int lda, const DeviceMemory &b, int ldb, float beta, \ - DeviceMemory *c, int ldc, blas::ComputationType computation_type, \ - blas::AlgorithmType algorithm, \ - blas::ProfileResult *output_profile_result) override; \ - bool DoBlasGemmWithAlgorithm( \ - Stream *stream, blas::Transpose transa, blas::Transpose transb, \ - uint64 m, uint64 n, uint64 k, double alpha, \ - const DeviceMemory &a, int lda, const DeviceMemory &b, \ - int ldb, double beta, DeviceMemory *c, int ldc, \ - blas::ComputationType computation_type, blas::AlgorithmType algorithm, \ - blas::ProfileResult *output_profile_result) override; \ - bool DoBlasGemmWithAlgorithm( \ - Stream *stream, blas::Transpose transa, blas::Transpose transb, \ - uint64 m, uint64 n, uint64 k, std::complex alpha, \ - const DeviceMemory> &a, int lda, \ - const DeviceMemory> &b, int ldb, \ - std::complex beta, DeviceMemory> *c, int ldc, \ - blas::ComputationType computation_type, blas::AlgorithmType algorithm, \ - blas::ProfileResult *output_profile_result) override; \ - bool DoBlasGemmWithAlgorithm( \ - Stream *stream, blas::Transpose transa, blas::Transpose transb, \ - uint64 m, uint64 n, uint64 k, std::complex alpha, \ - const DeviceMemory> &a, int lda, \ - const DeviceMemory> &b, int ldb, \ - std::complex beta, DeviceMemory> *c, \ + uint64 m, uint64 n, uint64 k, const HostOrDeviceScalar &alpha, \ + const DeviceMemory &a, int lda, const DeviceMemory &b, \ + int ldb, const HostOrDeviceScalar &beta, DeviceMemory *c, \ int ldc, blas::ComputationType computation_type, \ blas::AlgorithmType algorithm, \ blas::ProfileResult *output_profile_result) override; \ + bool DoBlasGemmWithAlgorithm( \ + Stream *stream, blas::Transpose transa, blas::Transpose transb, \ + uint64 m, uint64 n, uint64 k, const HostOrDeviceScalar &alpha, \ + const DeviceMemory &a, int lda, const DeviceMemory &b, \ + int ldb, const HostOrDeviceScalar &beta, \ + DeviceMemory *c, int ldc, \ + blas::ComputationType computation_type, blas::AlgorithmType algorithm, \ + blas::ProfileResult *output_profile_result) override; \ + bool DoBlasGemmWithAlgorithm( \ + Stream *stream, blas::Transpose transa, blas::Transpose transb, \ + uint64 m, uint64 n, uint64 k, \ + const HostOrDeviceScalar> &alpha, \ + const DeviceMemory> &a, int lda, \ + 
const DeviceMemory> &b, int ldb, \ + const HostOrDeviceScalar> &beta, \ + DeviceMemory> *c, int ldc, \ + blas::ComputationType computation_type, blas::AlgorithmType algorithm, \ + blas::ProfileResult *output_profile_result) override; \ + bool DoBlasGemmWithAlgorithm( \ + Stream *stream, blas::Transpose transa, blas::Transpose transb, \ + uint64 m, uint64 n, uint64 k, \ + const HostOrDeviceScalar> &alpha, \ + const DeviceMemory> &a, int lda, \ + const DeviceMemory> &b, int ldb, \ + const HostOrDeviceScalar> &beta, \ + DeviceMemory> *c, int ldc, \ + blas::ComputationType computation_type, blas::AlgorithmType algorithm, \ + blas::ProfileResult *output_profile_result) override; \ bool DoBlasGemmBatched( \ Stream *stream, blas::Transpose transa, blas::Transpose transb, \ uint64 m, uint64 n, uint64 k, float alpha, \ diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc index 007c0f1c86c..3c1353aee31 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.cc +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -2156,10 +2156,11 @@ static bool TensorOpsAvailable(int cc_major) { template bool CUDABlas::DoBlasGemmWithAlgorithmImpl( Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, const CompT &alpha, const DeviceMemory &a, int lda, - const DeviceMemory &b, int ldb, const CompT &beta, - DeviceMemory *c, int ldc, blas::ComputationType computation_type, - blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) { + uint64 n, uint64 k, const HostOrDeviceScalar &alpha, + const DeviceMemory &a, int lda, const DeviceMemory &b, int ldb, + const HostOrDeviceScalar &beta, DeviceMemory *c, int ldc, + blas::ComputationType computation_type, blas::AlgorithmType algorithm, + blas::ProfileResult *output_profile_result) { // CUDA < version 8 and GPUs < sm_50 don't support cublasGemmEx. #if CUDA_VERSION < 8000 return false; @@ -2175,6 +2176,12 @@ bool CUDABlas::DoBlasGemmWithAlgorithmImpl( return false; } + // Either both 'alpha' and 'beta' need to be pointers to device memory, or + // they need to be both host scalars. + if (alpha.is_pointer() != beta.is_pointer()) { + return false; + } + std::unique_ptr timer; if (output_profile_result != nullptr) { timer.reset(new CUDATimer(parent_)); @@ -2187,10 +2194,15 @@ bool CUDABlas::DoBlasGemmWithAlgorithmImpl( // Since we are converting 'algorithm' to cublasGemmAlgo_t by static_cast, // we do the following compile-time check on the default value: static_assert(blas::kDefaultGemmAlgo == CUBLAS_GEMM_DFALT, ""); + // If 'alpha' and 'beta' are host scalars and CompT is Eigen::half, we + // essentially reinterpet_cast to __half, which is safe because Eigen::half + // inherits from __half. bool result = DoBlasInternalFailureOK( - wrap::cublasGemmEx, stream, /* pointer_mode_host = */ true, - CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, - CUDAMemory(a), cuda_in_type, lda, CUDAMemory(b), cuda_in_type, ldb, &beta, + wrap::cublasGemmEx, stream, /* pointer_mode_host = */ !alpha.is_pointer(), + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, + alpha.is_pointer() ? CUDAMemory(alpha.pointer()) : &alpha.value(), + CUDAMemory(a), cuda_in_type, lda, CUDAMemory(b), cuda_in_type, ldb, + beta.is_pointer() ? 
CUDAMemory(beta.pointer()) : &beta.value(), CUDAMemoryMutable(c), CUDADataType::type, ldc, CUDAComputationType(computation_type), static_cast(algorithm)); @@ -2239,10 +2251,11 @@ bool CUDABlas::GetBlasGemmAlgorithms( bool CUDABlas::DoBlasGemmWithAlgorithm( Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, int alpha, const DeviceMemory &a, int lda, - const DeviceMemory &b, int ldb, int beta, DeviceMemory *c, - int ldc, blas::ComputationType computation_type, - blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) { + uint64 n, uint64 k, const HostOrDeviceScalar &alpha, + const DeviceMemory &a, int lda, const DeviceMemory &b, int ldb, + const HostOrDeviceScalar &beta, DeviceMemory *c, int ldc, + blas::ComputationType computation_type, blas::AlgorithmType algorithm, + blas::ProfileResult *output_profile_result) { return DoBlasGemmWithAlgorithmImpl( stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, computation_type, algorithm, output_profile_result); @@ -2250,17 +2263,25 @@ bool CUDABlas::DoBlasGemmWithAlgorithm( bool CUDABlas::DoBlasGemmWithAlgorithm( Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, const Eigen::half &alpha, + uint64 n, uint64 k, const HostOrDeviceScalar &alpha, const DeviceMemory &a, int lda, - const DeviceMemory &b, int ldb, const Eigen::half &beta, - DeviceMemory *c, int ldc, - blas::ComputationType computation_type, blas::AlgorithmType algorithm, - blas::ProfileResult *output_profile_result) { + const DeviceMemory &b, int ldb, + const HostOrDeviceScalar &beta, DeviceMemory *c, + int ldc, blas::ComputationType computation_type, + blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) { if (computation_type == blas::ComputationType::kF32) { + if (alpha.is_pointer() || beta.is_pointer()) { + // We cannot easily convert a pointer to f16 memory to a pointer to f32 + // memory from here, so we don't support this for now. + // TODO(akuegel): Investigate whether we can do the conversion before + // calling DoBlasGemmWithAlgorithm. 
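+      // (Reinterpreting the f16 buffer as f32 would read the wrong bits, and
+      // an actual conversion would need either a device-side cast kernel or a
+      // round trip through the host before this call.)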
+ return false; + } + HostOrDeviceScalar float_alpha(static_cast(alpha.value())); + HostOrDeviceScalar float_beta(static_cast(beta.value())); return DoBlasGemmWithAlgorithmImpl( - stream, transa, transb, m, n, k, static_cast(alpha), a, lda, b, - ldb, static_cast(beta), c, ldc, computation_type, algorithm, - output_profile_result); + stream, transa, transb, m, n, k, float_alpha, a, lda, b, ldb, + float_beta, c, ldc, computation_type, algorithm, output_profile_result); } CHECK_EQ(computation_type, blas::ComputationType::kF16); @@ -2271,8 +2292,9 @@ bool CUDABlas::DoBlasGemmWithAlgorithm( bool CUDABlas::DoBlasGemmWithAlgorithm( Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, float alpha, const DeviceMemory &a, int lda, - const DeviceMemory &b, int ldb, float beta, DeviceMemory *c, + uint64 n, uint64 k, const HostOrDeviceScalar &alpha, + const DeviceMemory &a, int lda, const DeviceMemory &b, + int ldb, const HostOrDeviceScalar &beta, DeviceMemory *c, int ldc, blas::ComputationType computation_type, blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) { return DoBlasGemmWithAlgorithmImpl( @@ -2282,9 +2304,10 @@ bool CUDABlas::DoBlasGemmWithAlgorithm( bool CUDABlas::DoBlasGemmWithAlgorithm( Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, double alpha, const DeviceMemory &a, int lda, - const DeviceMemory &b, int ldb, double beta, - DeviceMemory *c, int ldc, blas::ComputationType computation_type, + uint64 n, uint64 k, const HostOrDeviceScalar &alpha, + const DeviceMemory &a, int lda, const DeviceMemory &b, + int ldb, const HostOrDeviceScalar &beta, DeviceMemory *c, + int ldc, blas::ComputationType computation_type, blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) { return DoBlasGemmWithAlgorithmImpl( stream, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, @@ -2293,10 +2316,11 @@ bool CUDABlas::DoBlasGemmWithAlgorithm( bool CUDABlas::DoBlasGemmWithAlgorithm( Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, std::complex alpha, + uint64 n, uint64 k, const HostOrDeviceScalar> &alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &b, int ldb, - std::complex beta, DeviceMemory> *c, int ldc, + const HostOrDeviceScalar> &beta, + DeviceMemory> *c, int ldc, blas::ComputationType computation_type, blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) { return DoBlasGemmWithAlgorithmImpl( @@ -2306,10 +2330,11 @@ bool CUDABlas::DoBlasGemmWithAlgorithm( bool CUDABlas::DoBlasGemmWithAlgorithm( Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, std::complex alpha, + uint64 n, uint64 k, const HostOrDeviceScalar> &alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &b, int ldb, - std::complex beta, DeviceMemory> *c, int ldc, + const HostOrDeviceScalar> &beta, + DeviceMemory> *c, int ldc, blas::ComputationType computation_type, blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) { return DoBlasGemmWithAlgorithmImpl( diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h index 55c414a1f92..12dc5e47fd1 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.h +++ b/tensorflow/stream_executor/cuda/cuda_blas.h @@ -21,6 +21,7 @@ limitations under the License. 
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_

 #include "tensorflow/stream_executor/blas.h"
+#include "tensorflow/stream_executor/host_or_device_scalar.h"
 #include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
@@ -116,18 +117,13 @@ class CUDABlas : public blas::BlasSupport {
                          int batch_count, ScratchAllocator *scratch_allocator);

   // Helper function for implementing DoBlasGemmWithAlgorithm.
-  //
-  // We take alpha and beta by const reference because T might be Eigen::half,
-  // and we want to avoid pulling in a dependency on Eigen. When we pass the
-  // references to cublas, we essentially reinterpret_cast to __half, which is
-  // safe because Eigen::half inherits from __half.
   template <typename InT, typename OutT, typename CompT>
   bool DoBlasGemmWithAlgorithmImpl(
       Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
-      uint64 n, uint64 k, const CompT &alpha, const DeviceMemory<InT> &a,
-      int lda, const DeviceMemory<InT> &b, int ldb, const CompT &beta,
-      DeviceMemory<OutT> *c, int ldc, blas::ComputationType computation_type,
-      blas::AlgorithmType algorithm,
+      uint64 n, uint64 k, const HostOrDeviceScalar<CompT> &alpha,
+      const DeviceMemory<InT> &a, int lda, const DeviceMemory<InT> &b, int ldb,
+      const HostOrDeviceScalar<CompT> &beta, DeviceMemory<OutT> *c, int ldc,
+      blas::ComputationType computation_type, blas::AlgorithmType algorithm,
       blas::ProfileResult *output_profile_result);

   // Helper function for implementing DoBlasGemmWithProfiling.
diff --git a/tensorflow/stream_executor/host_or_device_scalar.h b/tensorflow/stream_executor/host_or_device_scalar.h
new file mode 100644
index 00000000000..c9e3e147783
--- /dev/null
+++ b/tensorflow/stream_executor/host_or_device_scalar.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_HOST_OR_DEVICE_SCALAR_H_
+#define TENSORFLOW_STREAM_EXECUTOR_HOST_OR_DEVICE_SCALAR_H_
+
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/stream_executor/device_memory.h"
+
+namespace stream_executor {
+
+// Represents a value that is either a host scalar or a scalar stored
+// on the GPU device.
+template <typename ElemT>
+class HostOrDeviceScalar {
+ public:
+  // Not marked as explicit because when using this constructor, we usually want
+  // to set this to a compile-time constant.
+  HostOrDeviceScalar(ElemT value) : value_(value), is_pointer_(false) {}
+  explicit HostOrDeviceScalar(const DeviceMemory<ElemT>& pointer)
+      : pointer_(pointer), is_pointer_(true) {
+    CHECK_EQ(1, pointer.ElementCount());
+  }
+
+  bool is_pointer() const { return is_pointer_; }
+  const DeviceMemory<ElemT>& pointer() const {
+    CHECK(is_pointer());
+    return pointer_;
+  }
+  const ElemT& value() const {
+    CHECK(!is_pointer());
+    return value_;
+  }
+
+ private:
+  union {
+    ElemT value_;
+    DeviceMemory<ElemT> pointer_;
+  };
+  bool is_pointer_;
+};
+
+}  // namespace stream_executor
+#endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_OR_DEVICE_SCALAR_H_
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index f59d9a13acf..093f0c93065 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/stream_executor/blas.h"
 #include "tensorflow/stream_executor/host_buffer.h"
+#include "tensorflow/stream_executor/host_or_device_scalar.h"
 #include "tensorflow/stream_executor/lib/stacktrace.h"
 #include "tensorflow/stream_executor/lib/strcat.h"
 #include "tensorflow/stream_executor/platform.h"
@@ -133,6 +134,14 @@ string ToVlogString(float f) { return port::StrCat(f); }

 string ToVlogString(double d) { return port::StrCat(d); }

+template <typename ElemT>
+string ToVlogString(const HostOrDeviceScalar<ElemT> &memory_or_constant) {
+  if (memory_or_constant.is_pointer()) {
+    return ToVlogString(memory_or_constant.pointer());
+  }
+  return ToVlogString(memory_or_constant.value());
+}
+
 template <typename T>
 string ToVlogString(port::ArraySlice<T> elements) {
   string str = port::StrCat(
@@ -3882,32 +3891,10 @@ Stream &Stream::ThenBlasGemmWithProfiling(

 Stream &Stream::ThenBlasGemmWithAlgorithm(
     blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
-    uint64 k, const Eigen::half &alpha, const DeviceMemory<Eigen::half> &a,
-    int lda, const DeviceMemory<Eigen::half> &b, int ldb,
-    const Eigen::half &beta, DeviceMemory<Eigen::half> *c, int ldc,
-    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
-    blas::ProfileResult *output_profile_result) {
-  VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
-            PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
-            PARAM(beta), PARAM(c), PARAM(ldc), PARAM(computation_type),
-            PARAM(algorithm));
-
-  ThenBlasWithProfileImpl<blas::Transpose, blas::Transpose, uint64, uint64,
-                          uint64, const Eigen::half &,
-                          const DeviceMemory<Eigen::half> &, int,
-                          const DeviceMemory<Eigen::half> &, int,
-                          const Eigen::half &, DeviceMemory<Eigen::half> *, int,
-                          blas::ComputationType, blas::AlgorithmType>
-      impl;
-  return impl(this, &blas::BlasSupport::DoBlasGemmWithAlgorithm, transa, transb,
-              m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, computation_type,
-              algorithm, output_profile_result);
-}
-
-Stream &Stream::ThenBlasGemmWithAlgorithm(
-    blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
-    uint64 k, int alpha, const DeviceMemory<int8> &a, int lda,
-    const DeviceMemory<int8> &b, int ldb, int beta, DeviceMemory<int32> *c,
+    uint64 k, const HostOrDeviceScalar<Eigen::half> &alpha,
+    const DeviceMemory<Eigen::half> &a, int lda,
+    const DeviceMemory<Eigen::half> &b, int ldb,
+    const HostOrDeviceScalar<Eigen::half> &beta, DeviceMemory<Eigen::half> *c,
     int ldc, blas::ComputationType computation_type,
     blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
   VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
@@ -3916,8 +3903,33 @@ Stream &Stream::ThenBlasGemmWithAlgorithm(
             PARAM(algorithm));

   ThenBlasWithProfileImpl<
-      blas::Transpose, blas::Transpose, uint64, uint64, uint64, int,
-      const DeviceMemory<int8> &, int, const DeviceMemory<int8> &, int, int,
+
blas::Transpose, blas::Transpose, uint64, uint64, uint64, + const HostOrDeviceScalar &, + const DeviceMemory &, int, const DeviceMemory &, + int, const HostOrDeviceScalar &, DeviceMemory *, + int, blas::ComputationType, blas::AlgorithmType> + impl; + return impl(this, &blas::BlasSupport::DoBlasGemmWithAlgorithm, transa, transb, + m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, computation_type, + algorithm, output_profile_result); +} + +Stream &Stream::ThenBlasGemmWithAlgorithm( + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, const HostOrDeviceScalar &alpha, const DeviceMemory &a, + int lda, const DeviceMemory &b, int ldb, + const HostOrDeviceScalar &beta, DeviceMemory *c, int ldc, + blas::ComputationType computation_type, blas::AlgorithmType algorithm, + blas::ProfileResult *output_profile_result) { + VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), + PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), + PARAM(beta), PARAM(c), PARAM(ldc), PARAM(computation_type), + PARAM(algorithm)); + + ThenBlasWithProfileImpl< + blas::Transpose, blas::Transpose, uint64, uint64, uint64, + const HostOrDeviceScalar &, const DeviceMemory &, int, + const DeviceMemory &, int, const HostOrDeviceScalar &, DeviceMemory *, int, blas::ComputationType, blas::AlgorithmType> impl; return impl(this, &blas::BlasSupport::DoBlasGemmWithAlgorithm, transa, transb, @@ -3927,8 +3939,9 @@ Stream &Stream::ThenBlasGemmWithAlgorithm( Stream &Stream::ThenBlasGemmWithAlgorithm( blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, float alpha, const DeviceMemory &a, int lda, - const DeviceMemory &b, int ldb, float beta, DeviceMemory *c, + uint64 k, const HostOrDeviceScalar &alpha, + const DeviceMemory &a, int lda, const DeviceMemory &b, + int ldb, const HostOrDeviceScalar &beta, DeviceMemory *c, int ldc, blas::ComputationType computation_type, blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) { VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), @@ -3937,8 +3950,9 @@ Stream &Stream::ThenBlasGemmWithAlgorithm( PARAM(algorithm)); ThenBlasWithProfileImpl< - blas::Transpose, blas::Transpose, uint64, uint64, uint64, float, - const DeviceMemory &, int, const DeviceMemory &, int, float, + blas::Transpose, blas::Transpose, uint64, uint64, uint64, + const HostOrDeviceScalar &, const DeviceMemory &, int, + const DeviceMemory &, int, const HostOrDeviceScalar &, DeviceMemory *, int, blas::ComputationType, blas::AlgorithmType> impl; return impl(this, &blas::BlasSupport::DoBlasGemmWithAlgorithm, transa, transb, @@ -3948,32 +3962,35 @@ Stream &Stream::ThenBlasGemmWithAlgorithm( Stream &Stream::ThenBlasGemmWithAlgorithm( blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, double alpha, const DeviceMemory &a, int lda, - const DeviceMemory &b, int ldb, double beta, - DeviceMemory *c, int ldc, blas::ComputationType computation_type, + uint64 k, const HostOrDeviceScalar &alpha, + const DeviceMemory &a, int lda, const DeviceMemory &b, + int ldb, const HostOrDeviceScalar &beta, DeviceMemory *c, + int ldc, blas::ComputationType computation_type, blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) { VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb), PARAM(beta), PARAM(c), PARAM(ldc), PARAM(computation_type), PARAM(algorithm)); - ThenBlasWithProfileImpl &, int, - const DeviceMemory &, int, double, - 
DeviceMemory *, int, blas::ComputationType, - blas::AlgorithmType> + ThenBlasWithProfileImpl< + blas::Transpose, blas::Transpose, uint64, uint64, uint64, + const HostOrDeviceScalar &, const DeviceMemory &, int, + const DeviceMemory &, int, const HostOrDeviceScalar &, + DeviceMemory *, int, blas::ComputationType, blas::AlgorithmType> impl; return impl(this, &blas::BlasSupport::DoBlasGemmWithAlgorithm, transa, transb, - m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, computation_type, + m, n, k, HostOrDeviceScalar(alpha), a, lda, b, ldb, + HostOrDeviceScalar(beta), c, ldc, computation_type, algorithm, output_profile_result); } Stream &Stream::ThenBlasGemmWithAlgorithm( blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, std::complex alpha, + uint64 k, const HostOrDeviceScalar> &alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &b, int ldb, - std::complex beta, DeviceMemory> *c, int ldc, + const HostOrDeviceScalar> &beta, + DeviceMemory> *c, int ldc, blas::ComputationType computation_type, blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) { VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), @@ -3981,12 +3998,14 @@ Stream &Stream::ThenBlasGemmWithAlgorithm( PARAM(beta), PARAM(c), PARAM(ldc), PARAM(computation_type), PARAM(algorithm)); - ThenBlasWithProfileImpl< - blas::Transpose, blas::Transpose, uint64, uint64, uint64, - std::complex, const DeviceMemory> &, int, - const DeviceMemory> &, int, std::complex, - DeviceMemory> *, int, blas::ComputationType, - blas::AlgorithmType> + ThenBlasWithProfileImpl> &, + const DeviceMemory> &, int, + const DeviceMemory> &, int, + const HostOrDeviceScalar> &, + DeviceMemory> *, int, + blas::ComputationType, blas::AlgorithmType> impl; return impl(this, &blas::BlasSupport::DoBlasGemmWithAlgorithm, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, computation_type, @@ -3995,10 +4014,11 @@ Stream &Stream::ThenBlasGemmWithAlgorithm( Stream &Stream::ThenBlasGemmWithAlgorithm( blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, std::complex alpha, + uint64 k, const HostOrDeviceScalar> &alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &b, int ldb, - std::complex beta, DeviceMemory> *c, int ldc, + const HostOrDeviceScalar> &beta, + DeviceMemory> *c, int ldc, blas::ComputationType computation_type, blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) { VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k), @@ -4006,12 +4026,14 @@ Stream &Stream::ThenBlasGemmWithAlgorithm( PARAM(beta), PARAM(c), PARAM(ldc), PARAM(computation_type), PARAM(algorithm)); - ThenBlasWithProfileImpl< - blas::Transpose, blas::Transpose, uint64, uint64, uint64, - std::complex, const DeviceMemory> &, int, - const DeviceMemory> &, int, std::complex, - DeviceMemory> *, int, blas::ComputationType, - blas::AlgorithmType> + ThenBlasWithProfileImpl> &, + const DeviceMemory> &, int, + const DeviceMemory> &, int, + const HostOrDeviceScalar> &, + DeviceMemory> *, int, + blas::ComputationType, blas::AlgorithmType> impl; return impl(this, &blas::BlasSupport::DoBlasGemmWithAlgorithm, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, computation_type, diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h index d4a81440e96..3d1b011c570 100644 --- a/tensorflow/stream_executor/stream.h +++ b/tensorflow/stream_executor/stream.h @@ -30,6 +30,7 @@ limitations under the License. 
#include "tensorflow/stream_executor/dnn.h" #include "tensorflow/stream_executor/event.h" #include "tensorflow/stream_executor/fft.h" +#include "tensorflow/stream_executor/host_or_device_scalar.h" #include "tensorflow/stream_executor/kernel.h" #include "tensorflow/stream_executor/launch_dim.h" #include "tensorflow/stream_executor/lib/array_slice.h" @@ -1422,50 +1423,53 @@ class Stream { // See BlasSupport::DoBlasGemmWithAlgorithm. Stream &ThenBlasGemmWithAlgorithm( blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, const Eigen::half &alpha, const DeviceMemory &a, - int lda, const DeviceMemory &b, int ldb, - const Eigen::half &beta, DeviceMemory *c, int ldc, - blas::ComputationType computation_type, blas::AlgorithmType algorithm, - blas::ProfileResult *output_profile_result); - Stream &ThenBlasGemmWithAlgorithm(blas::Transpose transa, - blas::Transpose transb, uint64 m, uint64 n, - uint64 k, int alpha, - const DeviceMemory &a, int lda, - const DeviceMemory &b, int ldb, - int beta, DeviceMemory *c, int ldc, - blas::ComputationType computation_type, - blas::AlgorithmType algorithm, - blas::ProfileResult *output_profile_result); - Stream &ThenBlasGemmWithAlgorithm(blas::Transpose transa, - blas::Transpose transb, uint64 m, uint64 n, - uint64 k, float alpha, - const DeviceMemory &a, int lda, - const DeviceMemory &b, int ldb, - float beta, DeviceMemory *c, int ldc, - blas::ComputationType computation_type, - blas::AlgorithmType algorithm, - blas::ProfileResult *output_profile_result); - Stream &ThenBlasGemmWithAlgorithm( - blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, double alpha, const DeviceMemory &a, int lda, - const DeviceMemory &b, int ldb, double beta, - DeviceMemory *c, int ldc, blas::ComputationType computation_type, + uint64 k, const HostOrDeviceScalar &alpha, + const DeviceMemory &a, int lda, + const DeviceMemory &b, int ldb, + const HostOrDeviceScalar &beta, DeviceMemory *c, + int ldc, blas::ComputationType computation_type, blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result); Stream &ThenBlasGemmWithAlgorithm( blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, std::complex alpha, + uint64 k, const HostOrDeviceScalar &alpha, + const DeviceMemory &a, int lda, const DeviceMemory &b, + int ldb, const HostOrDeviceScalar &beta, DeviceMemory *c, + int ldc, blas::ComputationType computation_type, + blas::AlgorithmType algorithm, + blas::ProfileResult *output_profile_result); + Stream &ThenBlasGemmWithAlgorithm( + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, const HostOrDeviceScalar &alpha, + const DeviceMemory &a, int lda, const DeviceMemory &b, + int ldb, const HostOrDeviceScalar &beta, DeviceMemory *c, + int ldc, blas::ComputationType computation_type, + blas::AlgorithmType algorithm, + blas::ProfileResult *output_profile_result); + Stream &ThenBlasGemmWithAlgorithm( + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, const HostOrDeviceScalar &alpha, + const DeviceMemory &a, int lda, const DeviceMemory &b, + int ldb, const HostOrDeviceScalar &beta, DeviceMemory *c, + int ldc, blas::ComputationType computation_type, + blas::AlgorithmType algorithm, + blas::ProfileResult *output_profile_result); + Stream &ThenBlasGemmWithAlgorithm( + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, const HostOrDeviceScalar> &alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &b, int ldb, - 
std::complex beta, DeviceMemory> *c, int ldc, + const HostOrDeviceScalar> &beta, + DeviceMemory> *c, int ldc, blas::ComputationType computation_type, blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result); Stream &ThenBlasGemmWithAlgorithm( blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, std::complex alpha, + uint64 k, const HostOrDeviceScalar> &alpha, const DeviceMemory> &a, int lda, const DeviceMemory> &b, int ldb, - std::complex beta, DeviceMemory> *c, int ldc, + const HostOrDeviceScalar> &beta, + DeviceMemory> *c, int ldc, blas::ComputationType computation_type, blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result); From f62c472c470aee64147df58de584f0b8450b29ad Mon Sep 17 00:00:00 2001 From: Ian Langmore Date: Tue, 24 Apr 2018 06:08:14 -0700 Subject: [PATCH 0656/1734] Move LinearOperatorCirculant to third_party. PiperOrigin-RevId: 194075622 --- tensorflow/contrib/linalg/__init__.py | 4 + tensorflow/python/kernel_tests/linalg/BUILD | 20 + .../linalg/linear_operator_circulant_test.py | 700 +++++++++++ tensorflow/python/ops/linalg/linalg.py | 1 + .../ops/linalg/linear_operator_circulant.py | 1074 +++++++++++++++++ ...ear-operator-circulant.__metaclass__.pbtxt | 14 + ...ow.linalg.-linear-operator-circulant.pbtxt | 155 +++ ...-operator-circulant2-d.__metaclass__.pbtxt | 14 + ...linalg.-linear-operator-circulant2-d.pbtxt | 155 +++ ...-operator-circulant3-d.__metaclass__.pbtxt | 14 + ...linalg.-linear-operator-circulant3-d.pbtxt | 155 +++ .../tools/api/golden/tensorflow.linalg.pbtxt | 12 + 12 files changed, 2318 insertions(+) create mode 100644 tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py create mode 100644 tensorflow/python/ops/linalg/linear_operator_circulant.py create mode 100644 tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt create mode 100644 tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.pbtxt create mode 100644 tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt create mode 100644 tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt create mode 100644 tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt create mode 100644 tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt diff --git a/tensorflow/contrib/linalg/__init__.py b/tensorflow/contrib/linalg/__init__.py index 38bd66b13f7..554854da847 100644 --- a/tensorflow/contrib/linalg/__init__.py +++ b/tensorflow/contrib/linalg/__init__.py @@ -18,6 +18,9 @@ See the @{$python/contrib.linalg} guide. 
@@LinearOperator @@LinearOperatorBlockDiag +@@LinearOperatorCirculant +@@LinearOperatorCirculant2D +@@LinearOperatorCirculant3D @@LinearOperatorDiag @@LinearOperatorIdentity @@LinearOperatorScaledIdentity @@ -39,6 +42,7 @@ from tensorflow.contrib.linalg.python.ops.linear_operator_addition import * from tensorflow.contrib.linalg.python.ops.linear_operator_block_diag import * from tensorflow.contrib.linalg.python.ops.linear_operator_kronecker import * from tensorflow.python.ops.linalg.linear_operator import * +from tensorflow.python.ops.linalg.linear_operator_circulant import * from tensorflow.python.ops.linalg.linear_operator_composition import * from tensorflow.python.ops.linalg.linear_operator_diag import * from tensorflow.python.ops.linalg.linear_operator_full_matrix import * diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD index 7ffa48b6530..faeccc8fba9 100644 --- a/tensorflow/python/kernel_tests/linalg/BUILD +++ b/tensorflow/python/kernel_tests/linalg/BUILD @@ -43,6 +43,26 @@ cuda_py_test( tags = ["noasan"], # times out b/63678675 ) +cuda_py_test( + name = "linear_operator_circulant_test", + size = "medium", + srcs = ["linear_operator_circulant_test.py"], + additional_deps = [ + "//tensorflow/python/ops/linalg", + "//third_party/py/numpy", + "//tensorflow/python:array_ops", + "//tensorflow/python:spectral_ops_test_util", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:math_ops", + "//tensorflow/python:platform_test", + ], + shard_count = 5, + tags = ["noasan"], # times out b/63678675 +) + cuda_py_test( name = "linear_operator_diag_test", size = "medium", diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py new file mode 100644 index 00000000000..e7f2f1c12bf --- /dev/null +++ b/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py @@ -0,0 +1,700 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import spectral_ops_test_util
+from tensorflow.python.ops.linalg import linalg
+from tensorflow.python.ops.linalg import linear_operator_circulant
+from tensorflow.python.ops.linalg import linear_operator_test_util
+from tensorflow.python.platform import test
+
+rng = np.random.RandomState(0)
+_to_complex = linear_operator_circulant._to_complex
+
+
+class LinearOperatorCirculantBaseTest(object):
+  """Common class for circulant tests."""
+
+  @contextlib.contextmanager
+  def test_session(self, *args, **kwargs):
+    with test.TestCase.test_session(self, *args, **kwargs) as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        yield sess
+
+  def _shape_to_spectrum_shape(self, shape):
+    # If spectrum.shape = batch_shape + [N],
+    # this creates an operator of shape batch_shape + [N, N]
+    return shape[:-1]
+
+  def _spectrum_to_circulant_1d(self, spectrum, shape, dtype):
+    """Creates a circulant matrix from a spectrum.
+
+    Intentionally done in an explicit yet inefficient way. This provides a
+    cross check to the main code that uses fancy reshapes.
+
+    Args:
+      spectrum: Float or complex `Tensor`.
+      shape: Python list. Desired shape of returned matrix.
+      dtype: Type to cast the returned matrix to.
+
+    Returns:
+      Circulant (batch) matrix of desired `dtype`.
+    """
+    spectrum = _to_complex(spectrum)
+    spectrum_shape = self._shape_to_spectrum_shape(shape)
+    domain_dimension = spectrum_shape[-1]
+    if not domain_dimension:
+      return array_ops.zeros(shape, dtype)
+
+    # Explicitly compute the action of spectrum on basis vectors.
+    matrix_rows = []
+    for m in range(domain_dimension):
+      x = np.zeros([domain_dimension])
+      # x is a basis vector.
+      x[m] = 1.0
+      fft_x = math_ops.fft(x)
+      h_convolve_x = math_ops.ifft(spectrum * fft_x)
+      matrix_rows.append(h_convolve_x)
+    matrix = array_ops.stack(matrix_rows, axis=-1)
+    return math_ops.cast(matrix, dtype)
+
+
+class LinearOperatorCirculantTestSelfAdjointOperator(
+    LinearOperatorCirculantBaseTest,
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Test of LinearOperatorCirculant when operator is self-adjoint.
+
+  Real spectrum <==> Self adjoint operator.
+  Note that when the spectrum is real, the operator may still be complex.
+  """
+
+  @property
+  def _dtypes_to_test(self):
+    # This operator will always be complex because, although the spectrum is
+    # real, the matrix will not be real.
+    return [dtypes.complex64]
+
+  def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
+    shape = build_info.shape
+    # For this test class, we are creating real spectrums.
+    # We also want the spectrum to have eigenvalues bounded away from zero.
+    #
+    # spectrum is bounded away from zero.
+    spectrum = linear_operator_test_util.random_sign_uniform(
+        shape=self._shape_to_spectrum_shape(shape), minval=1., maxval=2.)
+    # If dtype is complex, cast spectrum to complex. The imaginary part will be
+    # zero, so the operator will still be self-adjoint.
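+    # (A real spectrum already gives matrix == adjoint(matrix); the cast only
+    # widens the dtype without changing any values, so self-adjointness is
+    # preserved.)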
+ spectrum = math_ops.cast(spectrum, dtype) + + if use_placeholder: + spectrum_ph = array_ops.placeholder(dtypes.complex64) + # Evaluate here because (i) you cannot feed a tensor, and (ii) + # it is random and we want the same value used for both mat and feed_dict. + spectrum = spectrum.eval() + operator = linalg.LinearOperatorCirculant( + spectrum_ph, is_self_adjoint=True, input_output_dtype=dtype) + feed_dict = {spectrum_ph: spectrum} + else: + operator = linalg.LinearOperatorCirculant( + spectrum, is_self_adjoint=True, input_output_dtype=dtype) + feed_dict = None + + mat = self._spectrum_to_circulant_1d(spectrum, shape, dtype=dtype) + + return operator, mat, feed_dict + + def test_simple_hermitian_spectrum_gives_operator_with_zero_imag_part(self): + with self.test_session(): + spectrum = math_ops.cast([1., 1j, -1j], dtypes.complex64) + operator = linalg.LinearOperatorCirculant( + spectrum, input_output_dtype=dtypes.complex64) + matrix = operator.to_dense() + imag_matrix = math_ops.imag(matrix) + eps = np.finfo(np.float32).eps + np.testing.assert_allclose(0, imag_matrix.eval(), rtol=0, atol=eps * 3) + + +class LinearOperatorCirculantTestHermitianSpectrum( + LinearOperatorCirculantBaseTest, + linear_operator_test_util.SquareLinearOperatorDerivedClassTest): + """Test of LinearOperatorCirculant when the spectrum is Hermitian. + + Hermitian spectrum <==> Real valued operator. We test both real and complex + dtypes here though. So in some cases the matrix will be complex but with + zero imaginary part. + """ + + @property + def _dtypes_to_test(self): + return [dtypes.float32, dtypes.complex64] + + def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder): + shape = build_info.shape + # For this test class, we are creating Hermitian spectrums. + # We also want the spectrum to have eigenvalues bounded away from zero. + # + # pre_spectrum is bounded away from zero. + pre_spectrum = linear_operator_test_util.random_uniform( + shape=self._shape_to_spectrum_shape(shape), minval=1., maxval=2.) + pre_spectrum_c = _to_complex(pre_spectrum) + + # Real{IFFT[pre_spectrum]} + # = IFFT[EvenPartOf[pre_spectrum]] + # is the IFFT of something that is also bounded away from zero. + # Therefore, FFT[pre_h] would be a well-conditioned spectrum. + pre_h = math_ops.ifft(pre_spectrum_c) + + # A spectrum is Hermitian iff it is the DFT of a real convolution kernel. + # So we will make spectrum = FFT[h], for real valued h. + h = math_ops.real(pre_h) + h_c = _to_complex(h) + + spectrum = math_ops.fft(h_c) + + if use_placeholder: + spectrum_ph = array_ops.placeholder(dtypes.complex64) + # Evaluate here because (i) you cannot feed a tensor, and (ii) + # it is random and we want the same value used for both mat and feed_dict. 
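+      # (spectrum.eval() materializes the random tensor as a numpy array,
+      # which, unlike a Tensor, can be passed through feed_dict.)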
+ spectrum = spectrum.eval() + operator = linalg.LinearOperatorCirculant( + spectrum_ph, input_output_dtype=dtype) + feed_dict = {spectrum_ph: spectrum} + else: + operator = linalg.LinearOperatorCirculant( + spectrum, input_output_dtype=dtype) + feed_dict = None + + mat = self._spectrum_to_circulant_1d(spectrum, shape, dtype=dtype) + + return operator, mat, feed_dict + + def test_simple_hermitian_spectrum_gives_operator_with_zero_imag_part(self): + with self.test_session(): + spectrum = math_ops.cast([1., 1j, -1j], dtypes.complex64) + operator = linalg.LinearOperatorCirculant( + spectrum, input_output_dtype=dtypes.complex64) + matrix = operator.to_dense() + imag_matrix = math_ops.imag(matrix) + eps = np.finfo(np.float32).eps + np.testing.assert_allclose(0, imag_matrix.eval(), rtol=0, atol=eps * 3) + + +class LinearOperatorCirculantTestNonHermitianSpectrum( + LinearOperatorCirculantBaseTest, + linear_operator_test_util.SquareLinearOperatorDerivedClassTest): + """Test of LinearOperatorCirculant when the spectrum is not Hermitian. + + Non-Hermitian spectrum <==> Complex valued operator. + We test only complex dtypes here. + """ + + @property + def _dtypes_to_test(self): + return [dtypes.complex64] + + def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder): + shape = build_info.shape + # Will be well conditioned enough to get accurate solves. + spectrum = linear_operator_test_util.random_sign_uniform( + shape=self._shape_to_spectrum_shape(shape), + dtype=dtypes.complex64, + minval=1., + maxval=2.) + + if use_placeholder: + spectrum_ph = array_ops.placeholder(dtypes.complex64) + # Evaluate here because (i) you cannot feed a tensor, and (ii) + # it is random and we want the same value used for both mat and feed_dict. + spectrum = spectrum.eval() + operator = linalg.LinearOperatorCirculant( + spectrum_ph, input_output_dtype=dtype) + feed_dict = {spectrum_ph: spectrum} + else: + operator = linalg.LinearOperatorCirculant( + spectrum, input_output_dtype=dtype) + feed_dict = None + + mat = self._spectrum_to_circulant_1d(spectrum, shape, dtype=dtype) + + return operator, mat, feed_dict + + def test_simple_hermitian_spectrum_gives_operator_with_zero_imag_part(self): + with self.test_session(): + spectrum = math_ops.cast([1., 1j, -1j], dtypes.complex64) + operator = linalg.LinearOperatorCirculant( + spectrum, input_output_dtype=dtypes.complex64) + matrix = operator.to_dense() + imag_matrix = math_ops.imag(matrix) + eps = np.finfo(np.float32).eps + np.testing.assert_allclose(0, imag_matrix.eval(), rtol=0, atol=eps * 3) + + def test_simple_positive_real_spectrum_gives_self_adjoint_pos_def_oper(self): + with self.test_session() as sess: + spectrum = math_ops.cast([6., 4, 2], dtypes.complex64) + operator = linalg.LinearOperatorCirculant( + spectrum, input_output_dtype=dtypes.complex64) + matrix, matrix_h = sess.run( + [operator.to_dense(), + linalg.adjoint(operator.to_dense())]) + self.assertAllClose(matrix, matrix_h) + operator.assert_positive_definite().run() # Should not fail + operator.assert_self_adjoint().run() # Should not fail + + def test_defining_operator_using_real_convolution_kernel(self): + with self.test_session(): + convolution_kernel = [1., 2., 1.] + spectrum = math_ops.fft( + math_ops.cast(convolution_kernel, dtypes.complex64)) + + # spectrum is shape [3] ==> operator is shape [3, 3] + # spectrum is Hermitian ==> operator is real. + operator = linalg.LinearOperatorCirculant(spectrum) + + # Allow for complex output so we can make sure it has zero imag part. 
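+      # (The operator was built without forcing a real dtype, so it reports
+      # complex64 and to_dense() returns complex values whose imaginary part
+      # can be checked directly.)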
+ self.assertEqual(operator.dtype, dtypes.complex64) + + matrix = operator.to_dense().eval() + np.testing.assert_allclose(0, np.imag(matrix), atol=1e-6) + + def test_hermitian_spectrum_gives_operator_with_zero_imag_part(self): + with self.test_session(): + # Make spectrum the FFT of a real convolution kernel h. This ensures that + # spectrum is Hermitian. + h = linear_operator_test_util.random_normal(shape=(3, 4)) + spectrum = math_ops.fft(math_ops.cast(h, dtypes.complex64)) + operator = linalg.LinearOperatorCirculant( + spectrum, input_output_dtype=dtypes.complex64) + matrix = operator.to_dense() + imag_matrix = math_ops.imag(matrix) + eps = np.finfo(np.float32).eps + np.testing.assert_allclose( + 0, imag_matrix.eval(), rtol=0, atol=eps * 3 * 4) + + def test_convolution_kernel_same_as_first_row_of_to_dense(self): + spectrum = [[3., 2., 1.], [2., 1.5, 1.]] + with self.test_session(): + operator = linalg.LinearOperatorCirculant(spectrum) + h = operator.convolution_kernel() + c = operator.to_dense() + + self.assertAllEqual((2, 3), h.get_shape()) + self.assertAllEqual((2, 3, 3), c.get_shape()) + self.assertAllClose(h.eval(), c.eval()[:, :, 0]) + + def test_assert_non_singular_fails_for_singular_operator(self): + spectrum = math_ops.cast([0, 4, 2j + 2], dtypes.complex64) + operator = linalg.LinearOperatorCirculant(spectrum) + with self.test_session(): + with self.assertRaisesOpError("Singular operator"): + operator.assert_non_singular().run() + + def test_assert_non_singular_does_not_fail_for_non_singular_operator(self): + spectrum = math_ops.cast([-3j, 4, 2j + 2], dtypes.complex64) + operator = linalg.LinearOperatorCirculant(spectrum) + with self.test_session(): + operator.assert_non_singular().run() # Should not fail + + def test_assert_positive_definite_fails_for_non_positive_definite(self): + spectrum = math_ops.cast([6., 4, 2j], dtypes.complex64) + operator = linalg.LinearOperatorCirculant(spectrum) + with self.test_session(): + with self.assertRaisesOpError("Not positive definite"): + operator.assert_positive_definite().run() + + def test_assert_positive_definite_does_not_fail_when_pos_def(self): + spectrum = math_ops.cast([6., 4, 2j + 2], dtypes.complex64) + operator = linalg.LinearOperatorCirculant(spectrum) + with self.test_session(): + operator.assert_positive_definite().run() # Should not fail + + def test_real_spectrum_and_not_self_adjoint_hint_raises(self): + spectrum = [1., 2.] + with self.assertRaisesRegexp(ValueError, "real.*always.*self-adjoint"): + linalg.LinearOperatorCirculant(spectrum, is_self_adjoint=False) + + def test_real_spectrum_auto_sets_is_self_adjoint_to_true(self): + spectrum = [1., 2.] + operator = linalg.LinearOperatorCirculant(spectrum) + self.assertTrue(operator.is_self_adjoint) + + +class LinearOperatorCirculant2DBaseTest(object): + """Common class for 2D circulant tests.""" + + @contextlib.contextmanager + def test_session(self, *args, **kwargs): + with test.TestCase.test_session(self, *args, **kwargs) as sess: + with spectral_ops_test_util.fft_kernel_label_map(): + yield sess + + @property + def _operator_build_infos(self): + build_info = linear_operator_test_util.OperatorBuildInfo + # non-batch operators (n, n) and batch operators. 
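+ # Each operator of shape batch_shape + [N0*N1, N0*N1] below is driven by a
+ # spectrum of shape batch_shape + [N0, N1]; see _shape_to_spectrum_shape.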
+ return [
+ build_info((0, 0)),
+ build_info((1, 1)),
+ build_info((1, 6, 6)),
+ build_info((3, 4, 4)),
+ build_info((2, 1, 3, 3))
+ ]
+
+ def _shape_to_spectrum_shape(self, shape):
+ """Get a spectrum shape that will make an operator of desired shape."""
+ # This 2D block circulant operator takes a spectrum of shape
+ # batch_shape + [N0, N1],
+ # and creates an operator of shape
+ # batch_shape + [N0*N1, N0*N1]
+ if shape == (0, 0):
+ return (0, 0)
+ elif shape == (1, 1):
+ return (1, 1)
+ elif shape == (1, 6, 6):
+ return (1, 2, 3)
+ elif shape == (3, 4, 4):
+ return (3, 2, 2)
+ elif shape == (2, 1, 3, 3):
+ return (2, 1, 3, 1)
+ else:
+ raise ValueError("Unhandled shape: %s" % shape)
+
+ def _spectrum_to_circulant_2d(self, spectrum, shape, dtype):
+ """Creates a block circulant matrix from a spectrum.
+
+ Intentionally done in an explicit yet inefficient way. This provides a
+ cross check to the main code that uses fancy reshapes.
+
+ Args:
+ spectrum: Float or complex `Tensor`.
+ shape: Python list. Desired shape of returned matrix.
+ dtype: Type to cast the returned matrix to.
+
+ Returns:
+ Block circulant (batch) matrix of desired `dtype`.
+ """
+ spectrum = _to_complex(spectrum)
+ spectrum_shape = self._shape_to_spectrum_shape(shape)
+ domain_dimension = spectrum_shape[-1]
+ if not domain_dimension:
+ return array_ops.zeros(shape, dtype)
+
+ block_shape = spectrum_shape[-2:]
+
+ # Explicitly compute the action of spectrum on basis vectors.
+ matrix_rows = []
+ for n0 in range(block_shape[0]):
+ for n1 in range(block_shape[1]):
+ x = np.zeros(block_shape)
+ # x is a basis vector.
+ x[n0, n1] = 1.0
+ fft_x = math_ops.fft2d(x)
+ h_convolve_x = math_ops.ifft2d(spectrum * fft_x)
+ # We want the flat version of the action of the operator on a basis
+ # vector, not the block version.
+ h_convolve_x = array_ops.reshape(h_convolve_x, shape[:-1])
+ matrix_rows.append(h_convolve_x)
+ matrix = array_ops.stack(matrix_rows, axis=-1)
+ return math_ops.cast(matrix, dtype)
+
+
+class LinearOperatorCirculant2DTestHermitianSpectrum(
+ LinearOperatorCirculant2DBaseTest,
+ linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+ """Test of LinearOperatorCirculant2D when the spectrum is Hermitian.
+
+ Hermitian spectrum <==> Real valued operator. We test both real and complex
+ dtypes here though. So in some cases the matrix will be complex but with
+ zero imaginary part.
+ """
+
+ @property
+ def _dtypes_to_test(self):
+ return [dtypes.float32, dtypes.complex64]
+
+ def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder):
+ shape = build_info.shape
+ # For this test class, we are creating Hermitian spectra.
+ # We also want the spectrum to have eigenvalues bounded away from zero.
+ #
+ # pre_spectrum is bounded away from zero.
+ pre_spectrum = linear_operator_test_util.random_uniform(
+ shape=self._shape_to_spectrum_shape(shape), minval=1., maxval=2.)
+ pre_spectrum_c = _to_complex(pre_spectrum)
+
+ # Real{IFFT[pre_spectrum]}
+ # = IFFT[EvenPartOf[pre_spectrum]]
+ # is the IFFT of something that is also bounded away from zero.
+ # Therefore, FFT[pre_h] would be a well-conditioned spectrum.
+ pre_h = math_ops.ifft2d(pre_spectrum_c)
+
+ # A spectrum is Hermitian iff it is the DFT of a real convolution kernel.
+ # So we will make spectrum = FFT[h], for real valued h.
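+ # (With the 2D DFT, real h gives the Hermitian symmetry
+ # H[(-n0) % N0, (-n1) % N1] = conj(H[n0, n1]).)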
+ h = math_ops.real(pre_h) + h_c = _to_complex(h) + + spectrum = math_ops.fft2d(h_c) + + if use_placeholder: + spectrum_ph = array_ops.placeholder(dtypes.complex64) + # Evaluate here because (i) you cannot feed a tensor, and (ii) + # it is random and we want the same value used for both mat and feed_dict. + spectrum = spectrum.eval() + operator = linalg.LinearOperatorCirculant2D( + spectrum_ph, input_output_dtype=dtype) + feed_dict = {spectrum_ph: spectrum} + else: + operator = linalg.LinearOperatorCirculant2D( + spectrum, input_output_dtype=dtype) + feed_dict = None + + mat = self._spectrum_to_circulant_2d(spectrum, shape, dtype=dtype) + + return operator, mat, feed_dict + + +class LinearOperatorCirculant2DTestNonHermitianSpectrum( + LinearOperatorCirculant2DBaseTest, + linear_operator_test_util.SquareLinearOperatorDerivedClassTest): + """Test of LinearOperatorCirculant when the spectrum is not Hermitian. + + Non-Hermitian spectrum <==> Complex valued operator. + We test only complex dtypes here. + """ + + @property + def _dtypes_to_test(self): + return [dtypes.complex64] + + def _operator_and_mat_and_feed_dict(self, build_info, dtype, use_placeholder): + shape = build_info.shape + # Will be well conditioned enough to get accurate solves. + spectrum = linear_operator_test_util.random_sign_uniform( + shape=self._shape_to_spectrum_shape(shape), + dtype=dtype, + minval=1., + maxval=2.) + + if use_placeholder: + spectrum_ph = array_ops.placeholder(dtypes.complex64) + # Evaluate here because (i) you cannot feed a tensor, and (ii) + # it is random and we want the same value used for both mat and feed_dict. + spectrum = spectrum.eval() + operator = linalg.LinearOperatorCirculant2D( + spectrum_ph, input_output_dtype=dtype) + feed_dict = {spectrum_ph: spectrum} + else: + operator = linalg.LinearOperatorCirculant2D( + spectrum, input_output_dtype=dtype) + feed_dict = None + + mat = self._spectrum_to_circulant_2d(spectrum, shape, dtype=dtype) + + return operator, mat, feed_dict + + def test_real_hermitian_spectrum_gives_real_symmetric_operator(self): + with self.test_session() as sess: + # This is a real and hermitian spectrum. + spectrum = [[1., 2., 2.], [3., 4., 4.], [3., 4., 4.]] + operator = linalg.LinearOperatorCirculant(spectrum) + + matrix_tensor = operator.to_dense() + self.assertEqual(matrix_tensor.dtype, + linear_operator_circulant._DTYPE_COMPLEX) + matrix_t = array_ops.matrix_transpose(matrix_tensor) + imag_matrix = math_ops.imag(matrix_tensor) + matrix, matrix_transpose, imag_matrix = sess.run( + [matrix_tensor, matrix_t, imag_matrix]) + + np.testing.assert_allclose(0, imag_matrix, atol=1e-6) + self.assertAllClose(matrix, matrix_transpose, atol=0) + + def test_real_spectrum_gives_self_adjoint_operator(self): + with self.test_session() as sess: + # This is a real and hermitian spectrum. 
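+ # (Strictly, the spectrum is only real; realness alone already makes the
+ # operator self-adjoint, which is what this test verifies.)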
+ spectrum = linear_operator_test_util.random_normal( + shape=(3, 3), dtype=dtypes.float32) + operator = linalg.LinearOperatorCirculant2D(spectrum) + + matrix_tensor = operator.to_dense() + self.assertEqual(matrix_tensor.dtype, + linear_operator_circulant._DTYPE_COMPLEX) + matrix_h = linalg.adjoint(matrix_tensor) + matrix, matrix_h = sess.run([matrix_tensor, matrix_h]) + self.assertAllClose(matrix, matrix_h, atol=0) + + def test_assert_non_singular_fails_for_singular_operator(self): + spectrum = math_ops.cast([[0, 4], [2j + 2, 3.]], dtypes.complex64) + operator = linalg.LinearOperatorCirculant2D(spectrum) + with self.test_session(): + with self.assertRaisesOpError("Singular operator"): + operator.assert_non_singular().run() + + def test_assert_non_singular_does_not_fail_for_non_singular_operator(self): + spectrum = math_ops.cast([[-3j, 4], [2j + 2, 3.]], dtypes.complex64) + operator = linalg.LinearOperatorCirculant2D(spectrum) + with self.test_session(): + operator.assert_non_singular().run() # Should not fail + + def test_assert_positive_definite_fails_for_non_positive_definite(self): + spectrum = math_ops.cast([[6., 4], [2j, 3.]], dtypes.complex64) + operator = linalg.LinearOperatorCirculant2D(spectrum) + with self.test_session(): + with self.assertRaisesOpError("Not positive definite"): + operator.assert_positive_definite().run() + + def test_assert_positive_definite_does_not_fail_when_pos_def(self): + spectrum = math_ops.cast([[6., 4], [2j + 2, 3.]], dtypes.complex64) + operator = linalg.LinearOperatorCirculant2D(spectrum) + with self.test_session(): + operator.assert_positive_definite().run() # Should not fail + + def test_real_spectrum_and_not_self_adjoint_hint_raises(self): + spectrum = [[1., 2.], [3., 4]] + with self.assertRaisesRegexp(ValueError, "real.*always.*self-adjoint"): + linalg.LinearOperatorCirculant2D(spectrum, is_self_adjoint=False) + + def test_real_spectrum_auto_sets_is_self_adjoint_to_true(self): + spectrum = [[1., 2.], [3., 4]] + operator = linalg.LinearOperatorCirculant2D(spectrum) + self.assertTrue(operator.is_self_adjoint) + + def test_invalid_dtype_raises(self): + spectrum = array_ops.constant(rng.rand(2, 2, 2)) + with self.assertRaisesRegexp(TypeError, "must have dtype"): + linalg.LinearOperatorCirculant2D(spectrum) + + def test_invalid_rank_raises(self): + spectrum = array_ops.constant(np.float32(rng.rand(2))) + with self.assertRaisesRegexp(ValueError, "must have at least 2 dimensions"): + linalg.LinearOperatorCirculant2D(spectrum) + + +class LinearOperatorCirculant3DTest(test.TestCase): + """Simple test of the 3D case. See also the 1D and 2D tests.""" + + @contextlib.contextmanager + def test_session(self, *args, **kwargs): + with test.TestCase.test_session(self, *args, **kwargs) as sess: + with spectral_ops_test_util.fft_kernel_label_map(): + yield sess + + def test_real_spectrum_gives_self_adjoint_operator(self): + with self.test_session() as sess: + # This is a real and hermitian spectrum. 
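+ # spectrum shape (2, 2, 3, 5) with block_depth 3 gives batch shape (2,),
+ # block shape (2, 3, 5), and operator shape (2, 30, 30), since N = 2*3*5.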
+ spectrum = linear_operator_test_util.random_normal( + shape=(2, 2, 3, 5), dtype=dtypes.float32) + operator = linalg.LinearOperatorCirculant3D(spectrum) + self.assertAllEqual((2, 2 * 3 * 5, 2 * 3 * 5), operator.shape) + + matrix_tensor = operator.to_dense() + self.assertEqual(matrix_tensor.dtype, + linear_operator_circulant._DTYPE_COMPLEX) + matrix_h = linalg.adjoint(matrix_tensor) + + matrix, matrix_h = sess.run([matrix_tensor, matrix_h]) + self.assertAllEqual((2, 2 * 3 * 5, 2 * 3 * 5), matrix.shape) + self.assertAllClose(matrix, matrix_h) + + def test_defining_operator_using_real_convolution_kernel(self): + with self.test_session(): + convolution_kernel = linear_operator_test_util.random_normal( + shape=(2, 2, 3, 5), dtype=dtypes.float32) + # Convolution kernel is real ==> spectrum is Hermitian. + spectrum = math_ops.fft3d( + math_ops.cast(convolution_kernel, dtypes.complex64)) + + # spectrum is Hermitian ==> operator is real. + operator = linalg.LinearOperatorCirculant3D(spectrum) + self.assertAllEqual((2, 2 * 3 * 5, 2 * 3 * 5), operator.shape) + + # Allow for complex output so we can make sure it has zero imag part. + self.assertEqual(operator.dtype, dtypes.complex64) + matrix = operator.to_dense().eval() + self.assertAllEqual((2, 2 * 3 * 5, 2 * 3 * 5), matrix.shape) + np.testing.assert_allclose(0, np.imag(matrix), atol=1e-6) + + def test_defining_spd_operator_by_taking_real_part(self): + with self.test_session() as sess: + # S is real and positive. + s = linear_operator_test_util.random_uniform( + shape=(10, 2, 3, 4), dtype=dtypes.float32, minval=1., maxval=2.) + + # Let S = S1 + S2, the Hermitian and anti-hermitian parts. + # S1 = 0.5 * (S + S^H), S2 = 0.5 * (S - S^H), + # where ^H is the Hermitian transpose of the function: + # f(n0, n1, n2)^H := ComplexConjugate[f(N0-n0, N1-n1, N2-n2)]. + # We want to isolate S1, since + # S1 is Hermitian by construction + # S1 is real since S is + # S1 is positive since it is the sum of two positive kernels + + # IDFT[S] = IDFT[S1] + IDFT[S2] + # = H1 + H2 + # where H1 is real since it is Hermitian, + # and H2 is imaginary since it is anti-Hermitian. + ifft_s = math_ops.ifft3d(math_ops.cast(s, dtypes.complex64)) + + # Throw away H2, keep H1. + real_ifft_s = math_ops.real(ifft_s) + + # This is the perfect spectrum! + # spectrum = DFT[H1] + # = S1, + fft_real_ifft_s = math_ops.fft3d( + math_ops.cast(real_ifft_s, dtypes.complex64)) + + # S1 is Hermitian ==> operator is real. + # S1 is real ==> operator is self-adjoint. + # S1 is positive ==> operator is positive-definite. + operator = linalg.LinearOperatorCirculant3D(fft_real_ifft_s) + + # Allow for complex output so we can check operator has zero imag part. + self.assertEqual(operator.dtype, dtypes.complex64) + matrix, matrix_t = sess.run([ + operator.to_dense(), + array_ops.matrix_transpose(operator.to_dense()) + ]) + operator.assert_positive_definite().run() # Should not fail. + np.testing.assert_allclose(0, np.imag(matrix), atol=1e-6) + self.assertAllClose(matrix, matrix_t) + + # Just to test the theory, get S2 as well. + # This should create an imaginary operator. + # S2 is anti-Hermitian ==> operator is imaginary. + # S2 is real ==> operator is self-adjoint. 
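+ # Below, S2 is recovered as DFT3[1j * Imag{IDFT3[S]}], since
+ # 1j * Imag{IDFT3[S]} = H2 and DFT3[H2] = S2.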
+ imag_ifft_s = math_ops.imag(ifft_s) + fft_imag_ifft_s = math_ops.fft3d( + 1j * math_ops.cast(imag_ifft_s, dtypes.complex64)) + operator_imag = linalg.LinearOperatorCirculant3D(fft_imag_ifft_s) + + matrix, matrix_h = sess.run([ + operator_imag.to_dense(), + array_ops.matrix_transpose(math_ops.conj(operator_imag.to_dense())) + ]) + self.assertAllClose(matrix, matrix_h) + np.testing.assert_allclose(0, np.real(matrix), atol=1e-7) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/ops/linalg/linalg.py b/tensorflow/python/ops/linalg/linalg.py index 14319025ff2..d73c21cdc0b 100644 --- a/tensorflow/python/ops/linalg/linalg.py +++ b/tensorflow/python/ops/linalg/linalg.py @@ -22,6 +22,7 @@ from __future__ import print_function # pylint: disable=wildcard-import,unused-import from tensorflow.python.ops.linalg.linalg_impl import * from tensorflow.python.ops.linalg.linear_operator import * +from tensorflow.python.ops.linalg.linear_operator_circulant import * from tensorflow.python.ops.linalg.linear_operator_composition import * from tensorflow.python.ops.linalg.linear_operator_diag import * from tensorflow.python.ops.linalg.linear_operator_full_matrix import * diff --git a/tensorflow/python/ops/linalg/linear_operator_circulant.py b/tensorflow/python/ops/linalg/linear_operator_circulant.py new file mode 100644 index 00000000000..c367ed25ad6 --- /dev/null +++ b/tensorflow/python/ops/linalg/linear_operator_circulant.py @@ -0,0 +1,1074 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""`LinearOperator` coming from a [[nested] block] circulant matrix.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops.distributions import util as distribution_util +from tensorflow.python.ops.linalg import linalg_impl as linalg +from tensorflow.python.ops.linalg import linear_operator +from tensorflow.python.ops.linalg import linear_operator_util +from tensorflow.python.util.tf_export import tf_export + +__all__ = [ + "LinearOperatorCirculant", + "LinearOperatorCirculant2D", + "LinearOperatorCirculant3D", +] + +# Different FFT Ops will be used for different block depths. +_FFT_OP = {1: math_ops.fft, 2: math_ops.fft2d, 3: math_ops.fft3d} +_IFFT_OP = {1: math_ops.ifft, 2: math_ops.ifft2d, 3: math_ops.ifft3d} + +# This is the only dtype allowed with fft ops. +# TODO(langmore) Add other types once available. +_DTYPE_COMPLEX = dtypes.complex64 + + +# TODO(langmore) Add transformations that create common spectrums, e.g. 
+# starting with the convolution kernel +# start with half a spectrum, and create a Hermitian one. +# common filters. +# TODO(langmore) Support rectangular Toeplitz matrices. +class _BaseLinearOperatorCirculant(linear_operator.LinearOperator): + """Base class for circulant operators. Not user facing. + + `LinearOperator` acting like a [batch] [[nested] block] circulant matrix. + """ + + def __init__(self, + spectrum, + block_depth, + input_output_dtype=_DTYPE_COMPLEX, + is_non_singular=None, + is_self_adjoint=None, + is_positive_definite=None, + is_square=True, + name="LinearOperatorCirculant"): + r"""Initialize an `_BaseLinearOperatorCirculant`. + + Args: + spectrum: Shape `[B1,...,Bb, N]` `Tensor`. Allowed dtypes are + `float32`, `complex64`. Type can be different than `input_output_dtype` + block_depth: Python integer, either 1, 2, or 3. Will be 1 for circulant, + 2 for block circulant, and 3 for nested block circulant. + input_output_dtype: `dtype` for input/output. Must be either + `float32` or `complex64`. + is_non_singular: Expect that this operator is non-singular. + is_self_adjoint: Expect that this operator is equal to its hermitian + transpose. If `spectrum` is real, this will always be true. + is_positive_definite: Expect that this operator is positive definite, + meaning the quadratic form `x^H A x` has positive real part for all + nonzero `x`. Note that we do not require the operator to be + self-adjoint to be positive-definite. See: + https://en.wikipedia.org/wiki/Positive-definite_matrix\ + #Extension_for_non_symmetric_matrices + is_square: Expect that this operator acts like square [batch] matrices. + name: A name to prepend to all ops created by this class. + + Raises: + ValueError: If `block_depth` is not an allowed value. + TypeError: If `spectrum` is not an allowed type. + """ + + allowed_block_depths = [1, 2, 3] + + self._name = name + + if block_depth not in allowed_block_depths: + raise ValueError("Expected block_depth to be in %s. Found: %s." % + (allowed_block_depths, block_depth)) + self._block_depth = block_depth + + with ops.name_scope(name, values=[spectrum]): + self._spectrum = self._check_spectrum_and_return_tensor(spectrum) + + # Check and auto-set hints. + if not self.spectrum.dtype.is_complex: + if is_self_adjoint is False: + raise ValueError( + "A real spectrum always corresponds to a self-adjoint operator.") + is_self_adjoint = True + + if is_square is False: + raise ValueError( + "A [[nested] block] circulant operator is always square.") + is_square = True + + # If spectrum.shape = [s0, s1, s2], and block_depth = 2, + # block_shape = [s1, s2] + s_shape = array_ops.shape(self.spectrum) + self._block_shape_tensor = s_shape[-self.block_depth:] + + # Add common variants of spectrum to the graph. + self._spectrum_complex = _to_complex(self.spectrum) + self._abs_spectrum = math_ops.abs(self.spectrum) + self._conj_spectrum = math_ops.conj(self._spectrum_complex) + + super(_BaseLinearOperatorCirculant, self).__init__( + dtype=dtypes.as_dtype(input_output_dtype), + graph_parents=[self.spectrum], + is_non_singular=is_non_singular, + is_self_adjoint=is_self_adjoint, + is_positive_definite=is_positive_definite, + is_square=is_square, + name=name) + + def _check_spectrum_and_return_tensor(self, spectrum): + """Static check of spectrum. 
Then return `Tensor` version."""
+ spectrum = ops.convert_to_tensor(spectrum, name="spectrum")
+
+ allowed_dtypes = [dtypes.float32, dtypes.complex64]
+ if spectrum.dtype not in allowed_dtypes:
+ raise TypeError("Argument spectrum must have dtype in %s. Found: %s" %
+ (allowed_dtypes, spectrum.dtype))
+ if spectrum.get_shape().ndims is not None:
+ if spectrum.get_shape().ndims < self.block_depth:
+ raise ValueError(
+ "Argument spectrum must have at least %d dimensions. Found: %s" %
+ (self.block_depth, spectrum))
+ return spectrum
+
+ @property
+ def block_depth(self):
+ """Depth of recursively defined circulant blocks defining this `Operator`.
+
+ With `A` the dense representation of this `Operator`,
+
+ `block_depth = 1` means `A` is symmetric circulant. For example,
+
+ ```
+ A = |x y z y|
+ |y x y z|
+ |z y x y|
+ |y z y x|
+ ```
+
+ `block_depth = 2` means `A` is block symmetric circulant with symmetric
+ circulant blocks. For example, with `X`, `Y`, `Z` symmetric circulant,
+
+ ```
+ A = |X Y Z Y|
+ |Y X Y Z|
+ |Z Y X Y|
+ |Y Z Y X|
+ ```
+
+ `block_depth = 3` means `A` is block symmetric circulant with block
+ symmetric circulant blocks.
+
+ Returns:
+ Python `integer`.
+ """
+ return self._block_depth
+
+ def block_shape_tensor(self):
+ """Shape of the block dimensions of `self.spectrum`."""
+ return self._block_shape_tensor
+
+ @property
+ def block_shape(self):
+ return self.spectrum.get_shape()[-self.block_depth:]
+
+ @property
+ def spectrum(self):
+ return self._spectrum
+
+ def _vectorize_then_blockify(self, matrix):
+ """Shape batch matrix to batch vector, then blockify trailing dimensions."""
+ # Suppose
+ # matrix.shape = [m0, m1, m2, m3],
+ # and matrix is a matrix because the final two dimensions are matrix dims.
+ # self.block_depth = 2,
+ # self.block_shape = [b0, b1] (note b0 * b1 = m2).
+ # We will reshape matrix to
+ # [m3, m0, m1, b0, b1].
+
+ # Vectorize: Reshape to batch vector.
+ # [m0, m1, m2, m3] --> [m3, m0, m1, m2]
+ # This is called "vectorize" because we have taken the final two matrix dims
+ # and turned this into a size m3 batch of vectors.
+ vec = distribution_util.rotate_transpose(matrix, shift=1)
+
+ # Blockify: Blockify trailing dimensions.
+ # [m3, m0, m1, m2] --> [m3, m0, m1, b0, b1]
+ if (vec.get_shape().is_fully_defined() and
+ self.block_shape.is_fully_defined()):
+ # vec_leading_shape = [m3, m0, m1],
+ # the parts of vec that will not be blockified.
+ vec_leading_shape = vec.get_shape()[:-1]
+ final_shape = vec_leading_shape.concatenate(self.block_shape)
+ else:
+ vec_leading_shape = array_ops.shape(vec)[:-1]
+ final_shape = array_ops.concat(
+ (vec_leading_shape, self.block_shape_tensor()), 0)
+ return array_ops.reshape(vec, final_shape)
+
+ def _unblockify_then_matricize(self, vec):
+ """Flatten the block dimensions then reshape to a batch matrix."""
+ # Suppose
+ # vec.shape = [v0, v1, v2, v3],
+ # self.block_depth = 2.
+ # Then
+ # leading shape = [v0, v1]
+ # block shape = [v2, v3].
+ # We will reshape vec to
+ # [v1, v2*v3, v0].
+
+ # Un-blockify: Flatten block dimensions. Reshape
+ # [v0, v1, v2, v3] --> [v0, v1, v2*v3].
+ if vec.get_shape().is_fully_defined(): + # vec_shape = [v0, v1, v2, v3] + vec_shape = vec.get_shape().as_list() + # vec_leading_shape = [v0, v1] + vec_leading_shape = vec_shape[:-self.block_depth] + # vec_block_shape = [v2, v3] + vec_block_shape = vec_shape[-self.block_depth:] + # flat_shape = [v0, v1, v2*v3] + flat_shape = vec_leading_shape + [np.prod(vec_block_shape)] + else: + vec_shape = array_ops.shape(vec) + vec_leading_shape = vec_shape[:-self.block_depth] + vec_block_shape = vec_shape[-self.block_depth:] + flat_shape = array_ops.concat( + (vec_leading_shape, [math_ops.reduce_prod(vec_block_shape)]), 0) + vec_flat = array_ops.reshape(vec, flat_shape) + + # Matricize: Reshape to batch matrix. + # [v0, v1, v2*v3] --> [v1, v2*v3, v0], + # representing a shape [v1] batch of [v2*v3, v0] matrices. + matrix = distribution_util.rotate_transpose(vec_flat, shift=-1) + return matrix + + def _fft(self, x): + """FFT along the last self.block_depth dimensions of x. + + Args: + x: `Tensor` with floating or complex `dtype`. + Should be in the form returned by self._vectorize_then_blockify. + + Returns: + `Tensor` with `dtype` `complex64`. + """ + x_complex = _to_complex(x) + return _FFT_OP[self.block_depth](x_complex) + + def _ifft(self, x): + """IFFT along the last self.block_depth dimensions of x. + + Args: + x: `Tensor` with floating or complex dtype. Should be in the form + returned by self._vectorize_then_blockify. + + Returns: + `Tensor` with `dtype` `complex64`. + """ + x_complex = _to_complex(x) + return _IFFT_OP[self.block_depth](x_complex) + + def convolution_kernel(self, name="convolution_kernel"): + """Convolution kernel corresponding to `self.spectrum`. + + The `D` dimensional DFT of this kernel is the frequency domain spectrum of + this operator. + + Args: + name: A name to give this `Op`. + + Returns: + `Tensor` with `dtype` `self.dtype`. + """ + with self._name_scope(name): + h = self._ifft(self._spectrum_complex) + return math_ops.cast(h, self.dtype) + + def _shape(self): + s_shape = self._spectrum.get_shape() + # Suppose spectrum.shape = [a, b, c, d] + # block_depth = 2 + # Then: + # batch_shape = [a, b] + # N = c*d + # and we want to return + # [a, b, c*d, c*d] + batch_shape = s_shape[:-self.block_depth] + # trailing_dims = [c, d] + trailing_dims = s_shape[-self.block_depth:] + if trailing_dims.is_fully_defined(): + n = np.prod(trailing_dims.as_list()) + else: + n = None + n_x_n = tensor_shape.TensorShape([n, n]) + return batch_shape.concatenate(n_x_n) + + def _shape_tensor(self): + # See self.shape for explanation of steps + s_shape = array_ops.shape(self._spectrum) + batch_shape = s_shape[:-self.block_depth] + trailing_dims = s_shape[-self.block_depth:] + n = math_ops.reduce_prod(trailing_dims) + n_x_n = [n, n] + return array_ops.concat((batch_shape, n_x_n), 0) + + def assert_hermitian_spectrum(self, name="assert_hermitian_spectrum"): + """Returns an `Op` that asserts this operator has Hermitian spectrum. + + This operator corresponds to a real-valued matrix if and only if its + spectrum is Hermitian. + + Args: + name: A name to give this `Op`. + + Returns: + An `Op` that asserts this operator has Hermitian spectrum. + """ + eps = np.finfo(self.dtype.real_dtype.as_numpy_dtype).eps + with self._name_scope(name): + # Assume linear accumulation of error. 
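+ # Each of the N terms in the IDFT contributes up to ~eps of rounding error,
+ # so the imaginary part of the kernel is allowed to reach eps * N.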
+ max_err = eps * self.domain_dimension_tensor()
+ imag_convolution_kernel = math_ops.imag(self.convolution_kernel())
+ return check_ops.assert_less(
+ math_ops.abs(imag_convolution_kernel),
+ max_err,
+ message="Spectrum was not Hermitian")
+
+ def _assert_non_singular(self):
+ return linear_operator_util.assert_no_entries_with_modulus_zero(
+ self.spectrum,
+ message="Singular operator: Spectrum contained zero values.")
+
+ def _assert_positive_definite(self):
+ # This operator has the action Ax = F^H D F x,
+ # where D is the diagonal matrix with self.spectrum on the diag. Therefore,
+ # <x, Ax> = <Fx, D Fx>.
+ # Since F is bijective, the condition for positive definite is the same as
+ # for a diagonal matrix, i.e. real part of spectrum is positive.
+ message = (
+ "Not positive definite: Real part of spectrum was not all positive.")
+ return check_ops.assert_positive(
+ math_ops.real(self.spectrum), message=message)
+
+ def _assert_self_adjoint(self):
+ # Recall correspondence between symmetry and real transforms. See docstring.
+ return linear_operator_util.assert_zero_imag_part(
+ self.spectrum,
+ message=(
+ "Not self-adjoint: The spectrum contained non-zero imaginary part."
+ ))
+
+ def _broadcast_batch_dims(self, x, spectrum):
+ """Broadcast batch dims of batch matrix `x` and spectrum."""
+ # spectrum.shape = batch_shape + block_shape
+ # First make spectrum a batch matrix with
+ # spectrum.shape = batch_shape + [prod(block_shape), 1]
+ spec_mat = array_ops.reshape(
+ spectrum, array_ops.concat(
+ (self.batch_shape_tensor(), [-1, 1]), axis=0))
+ # Second, broadcast, possibly requiring an addition of array of zeros.
+ x, spec_mat = linear_operator_util.broadcast_matrix_batch_dims((x,
+ spec_mat))
+ # Third, put the block shape back into spectrum.
+ batch_shape = array_ops.shape(x)[:-2]
+ spectrum = array_ops.reshape(
+ spec_mat,
+ array_ops.concat((batch_shape, self.block_shape_tensor()), axis=0))
+
+ return x, spectrum
+
+ def _matmul(self, x, adjoint=False, adjoint_arg=False):
+ x = linalg.adjoint(x) if adjoint_arg else x
+ # With F the matrix of a DFT, and F^{-1}, F^H the inverse and Hermitian
+ # transpose, one can show that F^{-1} = F^{H} is the IDFT matrix. Therefore
+ # matmul(x) = F^{-1} diag(spectrum) F x,
+ # = F^{H} diag(spectrum) F x,
+ # so that
+ # matmul(x, adjoint=True) = F^{H} diag(conj(spectrum)) F x.
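+ # The same diagonalization A = F^H diag(spectrum) F underlies _solve below,
+ # which divides by the spectrum instead of multiplying by it.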
+ spectrum = self._conj_spectrum if adjoint else self._spectrum_complex + + x, spectrum = self._broadcast_batch_dims(x, spectrum) + + x_vb = self._vectorize_then_blockify(x) + fft_x_vb = self._fft(x_vb) + block_vector_result = self._ifft(spectrum * fft_x_vb) + y = self._unblockify_then_matricize(block_vector_result) + + return math_ops.cast(y, self.dtype) + + def _determinant(self): + reduction_indices = [-(i + 1) for i in range(self.block_depth)] + det = math_ops.reduce_prod( + self.spectrum, reduction_indices=reduction_indices) + return math_ops.cast(det, self.dtype) + + def _log_abs_determinant(self): + reduction_indices = [-(i + 1) for i in range(self.block_depth)] + lad = math_ops.reduce_sum( + math_ops.log(self._abs_spectrum), reduction_indices=reduction_indices) + return math_ops.cast(lad, self.dtype) + + def _solve(self, rhs, adjoint=False, adjoint_arg=False): + rhs = linalg.adjoint(rhs) if adjoint_arg else rhs + spectrum = self._conj_spectrum if adjoint else self._spectrum_complex + + rhs, spectrum = self._broadcast_batch_dims(rhs, spectrum) + + rhs_vb = self._vectorize_then_blockify(rhs) + fft_rhs_vb = self._fft(rhs_vb) + solution_vb = self._ifft(fft_rhs_vb / spectrum) + x = self._unblockify_then_matricize(solution_vb) + return math_ops.cast(x, self.dtype) + + def _diag_part(self): + # Get ones in shape of diag, which is [B1,...,Bb, N] + # Also get the size of the diag, "N". + if self.shape.is_fully_defined(): + diag_shape = self.shape[:-1] + diag_size = self.domain_dimension.value + else: + diag_shape = self.shape_tensor()[:-1] + diag_size = self.domain_dimension_tensor() + ones_diag = array_ops.ones(diag_shape, dtype=self.dtype) + + # As proved in comments in self._trace, the value on the diag is constant, + # repeated N times. This value is the trace divided by N. + + # The handling of self.shape = (0, 0) is tricky, and is the reason we choose + # to compute trace and use that to compute diag_part, rather than computing + # the value on the diagonal ("diag_value") directly. Both result in a 0/0, + # but in different places, and the current method gives the right result in + # the end. + + # Here, if self.shape = (0, 0), then self.trace() = 0., and then + # diag_value = 0. / 0. = NaN. + diag_value = self.trace() / math_ops.cast(diag_size, self.dtype) + + # If self.shape = (0, 0), then ones_diag = [] (empty tensor), and then + # the following line is NaN * [] = [], as needed. + return diag_value[..., array_ops.newaxis] * ones_diag + + def _trace(self): + # The diagonal of the [[nested] block] circulant operator is the mean of + # the spectrum. + # Proof: For the [0,...,0] element, this follows from the IDFT formula. + # Then the result follows since all diagonal elements are the same. + + # Therefore, the trace is the sum of the spectrum. + + # Get shape of diag along with the axis over which to reduce the spectrum. + # We will reduce the spectrum over all block indices. + if self.spectrum.get_shape().is_fully_defined(): + spec_rank = self.spectrum.get_shape().ndims + axis = np.arange(spec_rank - self.block_depth, spec_rank, dtype=np.int32) + else: + spec_rank = array_ops.rank(self.spectrum) + axis = math_ops.range(spec_rank - self.block_depth, spec_rank) + + # Real diag part "re_d". + # Suppose spectrum.shape = [B1,...,Bb, N1, N2] + # self.shape = [B1,...,Bb, N, N], with N1 * N2 = N. 
+ # re_d_value.shape = [B1,...,Bb]
+ re_d_value = math_ops.reduce_sum(math_ops.real(self.spectrum), axis=axis)
+
+ if not self.dtype.is_complex:
+ return math_ops.cast(re_d_value, self.dtype)
+
+ # Imaginary part, "im_d".
+ if self.is_self_adjoint:
+ im_d_value = 0.
+ else:
+ im_d_value = math_ops.reduce_sum(math_ops.imag(self.spectrum), axis=axis)
+
+ return math_ops.cast(math_ops.complex(re_d_value, im_d_value), self.dtype)
+
+
+@tf_export("linalg.LinearOperatorCirculant")
+class LinearOperatorCirculant(_BaseLinearOperatorCirculant):
+ """`LinearOperator` acting like a circulant matrix.
+
+ This operator acts like a circulant matrix `A` with
+ shape `[B1,...,Bb, N, N]` for some `b >= 0`. The first `b` indices index a
+ batch member. For every batch index `(i1,...,ib)`, `A[i1,...,ib, : :]` is
+ an `N x N` matrix. This matrix `A` is not materialized, but for
+ purposes of broadcasting this shape will be relevant.
+
+ #### Description in terms of circulant matrices
+
+ Circulant means the entries of `A` are generated by a single vector, the
+ convolution kernel `h`: `A_{mn} := h_{m-n mod N}`. With `h = [w, x, y, z]`,
+
+ ```
+ A = |w z y x|
+ |x w z y|
+ |y x w z|
+ |z y x w|
+ ```
+
+ This means that the result of matrix multiplication `v = Au` has `Lth` column
+ given by circular convolution of `h` with the `Lth` column of `u`.
+
+ See http://ee.stanford.edu/~gray/toeplitz.pdf
+
+ #### Description in terms of the frequency spectrum
+
+ There is an equivalent description in terms of the [batch] spectrum `H` and
+ Fourier transforms. Here we consider `A.shape = [N, N]` and ignore batch
+ dimensions. Define the discrete Fourier transform (DFT) and its inverse by
+
+ ```
+ DFT[ h[n] ] = H[k] := sum_{n = 0}^{N - 1} h_n e^{-i 2pi k n / N}
+ IDFT[ H[k] ] = h[n] = N^{-1} sum_{k = 0}^{N - 1} H_k e^{i 2pi k n / N}
+ ```
+
+ From these definitions, we see that
+
+ ```
+ H[0] = sum_{n = 0}^{N - 1} h_n
+ H[1] = "the first positive frequency"
+ H[N - 1] = "the first negative frequency"
+ ```
+
+ Loosely speaking, with `*` element-wise multiplication, matrix multiplication
+ is equal to the action of a Fourier multiplier: `A u = IDFT[ H * DFT[u] ]`.
+ Precisely speaking, given `[N, R]` matrix `u`, let `DFT[u]` be the `[N, R]`
+ matrix with `rth` column equal to the DFT of the `rth` column of `u`.
+ Define the `IDFT` similarly.
+ Matrix multiplication may be expressed columnwise:
+
+ ```(A u)_r = IDFT[ H * (DFT[u])_r ]```
+
+ #### Operator properties deduced from the spectrum.
+
+ Letting `u` be the `kth` Euclidean basis vector and `U = IDFT[u]`, the
+ above formulas show that `A U = H_k * U`. We conclude that the elements
+ of `H` are the eigenvalues of this operator. Therefore
+
+ * This operator is positive definite if and only if `Real{H} > 0`.
+
+ A general property of Fourier transforms is the correspondence between
+ Hermitian functions and real valued transforms.
+
+ Suppose `H.shape = [B1,...,Bb, N]`. We say that `H` is a Hermitian spectrum
+ if, with `%` meaning modulus division,
+
+ ```H[..., n % N] = ComplexConjugate[ H[..., (-n) % N] ]```
+
+ * This operator corresponds to a real matrix if and only if `H` is Hermitian.
+ * This operator is self-adjoint if and only if `H` is real.
+
+ See e.g. "Discrete-Time Signal Processing", Oppenheim and Schafer.
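+
+ As a cross-check of the eigenvalue claim above, here is an illustrative
+ NumPy sketch (not part of this API):
+
+ ```python
+ import numpy as np
+
+ h = np.array([4., 1., 0., 1.])  # real convolution kernel
+ H = np.fft.fft(h)               # spectrum, here [6, 4, 2, 4]
+ N = len(h)
+ # Dense circulant matrix A_{mk} = h_{(m - k) % N}.
+ A = np.array([[h[(m - k) % N] for k in range(N)] for m in range(N)])
+ # The eigenvalues of A are the spectrum values, up to ordering.
+ np.testing.assert_allclose(
+     np.sort(np.linalg.eigvals(A).real), np.sort(H.real))
+ ```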
+
+ #### Example of a self-adjoint positive definite operator
+
+ ```python
+ # spectrum is real ==> operator is self-adjoint
+ # spectrum is positive ==> operator is positive definite
+ spectrum = [6., 4, 2]
+
+ operator = LinearOperatorCirculant(spectrum)
+
+ # IFFT[spectrum]
+ operator.convolution_kernel()
+ ==> [4 + 0j, 1 + 0.58j, 1 - 0.58j]
+
+ operator.to_dense()
+ ==> [[4 + 0.0j, 1 - 0.6j, 1 + 0.6j],
+ [1 + 0.6j, 4 + 0.0j, 1 - 0.6j],
+ [1 - 0.6j, 1 + 0.6j, 4 + 0.0j]]
+ ```
+
+ #### Example of defining in terms of a real convolution kernel
+
+ ```python
+ # convolution_kernel is real ==> spectrum is Hermitian.
+ convolution_kernel = [1., 2., 1.]
+ spectrum = tf.fft(tf.cast(convolution_kernel, tf.complex64))
+
+ # spectrum is Hermitian ==> operator is real.
+ # spectrum is shape [3] ==> operator is shape [3, 3]
+ # We force the input/output type to be real, which allows this to operate
+ # like a real matrix.
+ operator = LinearOperatorCirculant(spectrum, input_output_dtype=tf.float32)
+
+ operator.to_dense()
+ ==> [[ 1, 1, 2],
+ [ 2, 1, 1],
+ [ 1, 2, 1]]
+ ```
+
+ #### Example of Hermitian spectrum
+
+ ```python
+ # spectrum is shape [3] ==> operator is shape [3, 3]
+ # spectrum is Hermitian ==> operator is real.
+ spectrum = [1, 1j, -1j]
+
+ operator = LinearOperatorCirculant(spectrum)
+
+ operator.to_dense()
+ ==> [[ 0.33 + 0j, 0.91 + 0j, -0.24 + 0j],
+ [-0.24 + 0j, 0.33 + 0j, 0.91 + 0j],
+ [ 0.91 + 0j, -0.24 + 0j, 0.33 + 0j]]
+ ```
+
+ #### Example of forcing real `dtype` when spectrum is Hermitian
+
+ ```python
+ # spectrum is shape [4] ==> operator is shape [4, 4]
+ # spectrum is real ==> operator is self-adjoint
+ # spectrum is Hermitian ==> operator is real
+ # spectrum has positive real part ==> operator is positive-definite.
+ spectrum = [6., 4, 2, 4]
+
+ # Force the input dtype to be float32.
+ # Cast the output to float32. This is fine because the operator will be
+ # real due to Hermitian spectrum.
+ operator = LinearOperatorCirculant(spectrum, input_output_dtype=tf.float32)
+
+ operator.shape
+ ==> [4, 4]
+
+ operator.to_dense()
+ ==> [[4, 1, 0, 1],
+ [1, 4, 1, 0],
+ [0, 1, 4, 1],
+ [1, 0, 1, 4]]
+
+ # convolution_kernel = tf.ifft(spectrum)
+ operator.convolution_kernel()
+ ==> [4, 1, 0, 1]
+ ```
+
+ #### Performance
+
+ Suppose `operator` is a `LinearOperatorCirculant` of shape `[N, N]`,
+ and `x.shape = [N, R]`. Then
+
+ * `operator.matmul(x)` is `O(R*N*Log[N])`
+ * `operator.solve(x)` is `O(R*N*Log[N])`
+ * `operator.determinant()` involves a size `N` `reduce_prod`.
+
+ If instead `operator` and `x` have shape `[B1,...,Bb, N, N]` and
+ `[B1,...,Bb, N, R]`, every operation increases in complexity by `B1*...*Bb`.
+
+ #### Matrix property hints
+
+ This `LinearOperator` is initialized with boolean flags of the form `is_X`,
+ for `X = non_singular, self_adjoint, positive_definite, square`.
+ These have the following meaning:
+
+ * If `is_X == True`, callers should expect the operator to have the
+ property `X`. This is a promise that should be fulfilled, but is *not* a
+ runtime assert. For example, finite floating point precision may result
+ in these promises being violated.
+ * If `is_X == False`, callers should expect the operator to not have `X`.
+ * If `is_X == None` (the default), callers should have no expectation either
+ way.
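+
+ A minimal usage sketch (illustrative only; assumes a TF1-style session is
+ available as `sess`):
+
+ ```python
+ spectrum = [6., 4, 2]
+ operator = LinearOperatorCirculant(spectrum)
+
+ x = tf.ones([3, 2], dtype=tf.complex64)
+ y = operator.matmul(x)      # FFT-based, O(R * N * Log[N])
+ x_back = operator.solve(y)  # also FFT-based; valid since spectrum has no zeros
+ # sess.run(x_back) recovers x up to floating point error.
+ ```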
+ """ + + def __init__(self, + spectrum, + input_output_dtype=_DTYPE_COMPLEX, + is_non_singular=None, + is_self_adjoint=None, + is_positive_definite=None, + is_square=True, + name="LinearOperatorCirculant"): + r"""Initialize an `LinearOperatorCirculant`. + + This `LinearOperator` is initialized to have shape `[B1,...,Bb, N, N]` + by providing `spectrum`, a `[B1,...,Bb, N]` `Tensor`. + + If `input_output_dtype = DTYPE`: + + * Arguments to methods such as `matmul` or `solve` must be `DTYPE`. + * Values returned by all methods, such as `matmul` or `determinant` will be + cast to `DTYPE`. + + Note that if the spectrum is not Hermitian, then this operator corresponds + to a complex matrix with non-zero imaginary part. In this case, setting + `input_output_dtype` to a real type will forcibly cast the output to be + real, resulting in incorrect results! + + If on the other hand the spectrum is Hermitian, then this operator + corresponds to a real-valued matrix, and setting `input_output_dtype` to + a real type is fine. + + Args: + spectrum: Shape `[B1,...,Bb, N]` `Tensor`. Allowed dtypes are + `float32`, `complex64`. Type can be different than `input_output_dtype` + input_output_dtype: `dtype` for input/output. Must be either + `float32` or `complex64`. + is_non_singular: Expect that this operator is non-singular. + is_self_adjoint: Expect that this operator is equal to its hermitian + transpose. If `spectrum` is real, this will always be true. + is_positive_definite: Expect that this operator is positive definite, + meaning the quadratic form `x^H A x` has positive real part for all + nonzero `x`. Note that we do not require the operator to be + self-adjoint to be positive-definite. See: + https://en.wikipedia.org/wiki/Positive-definite_matrix\ + #Extension_for_non_symmetric_matrices + is_square: Expect that this operator acts like square [batch] matrices. + name: A name to prepend to all ops created by this class. + """ + super(LinearOperatorCirculant, self).__init__( + spectrum, + block_depth=1, + input_output_dtype=input_output_dtype, + is_non_singular=is_non_singular, + is_self_adjoint=is_self_adjoint, + is_positive_definite=is_positive_definite, + is_square=is_square, + name=name) + + +@tf_export("linalg.LinearOperatorCirculant2D") +class LinearOperatorCirculant2D(_BaseLinearOperatorCirculant): + """`LinearOperator` acting like a block circulant matrix. + + This operator acts like a block circulant matrix `A` with + shape `[B1,...,Bb, N, N]` for some `b >= 0`. The first `b` indices index a + batch member. For every batch index `(i1,...,ib)`, `A[i1,...,ib, : :]` is + an `N x N` matrix. This matrix `A` is not materialized, but for + purposes of broadcasting this shape will be relevant. + + #### Description in terms of block circulant matrices + + If `A` is block circulant, with block sizes `N0, N1` (`N0 * N1 = N`): + `A` has a block circulant structure, composed of `N0 x N0` blocks, with each + block an `N1 x N1` circulant matrix. + + For example, with `W`, `X`, `Y`, `Z` each circulant, + + ``` + A = |W Z Y X| + |X W Z Y| + |Y X W Z| + |Z Y X W| + ``` + + Note that `A` itself will not in general be circulant. + + #### Description in terms of the frequency spectrum + + There is an equivalent description in terms of the [batch] spectrum `H` and + Fourier transforms. Here we consider `A.shape = [N, N]` and ignore batch + dimensions. + + If `H.shape = [N0, N1]`, (`N0 * N1 = N`): + Loosely speaking, matrix multiplication is equal to the action of a + Fourier multiplier: `A u = IDFT2[ H DFT2[u] ]`. 
+ Precisely speaking, given `[N, R]` matrix `u`, let `DFT2[u]` be the + `[N0, N1, R]` `Tensor` defined by re-shaping `u` to `[N0, N1, R]` and taking + a two dimensional DFT across the first two dimensions. Let `IDFT2` be the + inverse of `DFT2`. Matrix multiplication may be expressed columnwise: + + ```(A u)_r = IDFT2[ H * (DFT2[u])_r ]``` + + #### Operator properties deduced from the spectrum. + + * This operator is positive definite if and only if `Real{H} > 0`. + + A general property of Fourier transforms is the correspondence between + Hermitian functions and real valued transforms. + + Suppose `H.shape = [B1,...,Bb, N0, N1]`, we say that `H` is a Hermitian + spectrum if, with `%` indicating modulus division, + + ``` + H[..., n0 % N0, n1 % N1] = ComplexConjugate[ H[..., (-n0) % N0, (-n1) % N1 ]. + ``` + + * This operator corresponds to a real matrix if and only if `H` is Hermitian. + * This operator is self-adjoint if and only if `H` is real. + + See e.g. "Discrete-Time Signal Processing", Oppenheim and Schafer. + + ### Example of a self-adjoint positive definite operator + + ```python + # spectrum is real ==> operator is self-adjoint + # spectrum is positive ==> operator is positive definite + spectrum = [[1., 2., 3.], + [4., 5., 6.], + [7., 8., 9.]] + + operator = LinearOperatorCirculant2D(spectrum) + + # IFFT[spectrum] + operator.convolution_kernel() + ==> [[5.0+0.0j, -0.5-.3j, -0.5+.3j], + [-1.5-.9j, 0, 0], + [-1.5+.9j, 0, 0]] + + operator.to_dense() + ==> Complex self adjoint 9 x 9 matrix. + ``` + + #### Example of defining in terms of a real convolution kernel, + + ```python + # convolution_kernel is real ==> spectrum is Hermitian. + convolution_kernel = [[1., 2., 1.], [5., -1., 1.]] + spectrum = tf.fft2d(tf.cast(convolution_kernel, tf.complex64)) + + # spectrum is shape [2, 3] ==> operator is shape [6, 6] + # spectrum is Hermitian ==> operator is real. + operator = LinearOperatorCirculant2D(spectrum, input_output_dtype=tf.float32) + ``` + + #### Performance + + Suppose `operator` is a `LinearOperatorCirculant` of shape `[N, N]`, + and `x.shape = [N, R]`. Then + + * `operator.matmul(x)` is `O(R*N*Log[N])` + * `operator.solve(x)` is `O(R*N*Log[N])` + * `operator.determinant()` involves a size `N` `reduce_prod`. + + If instead `operator` and `x` have shape `[B1,...,Bb, N, N]` and + `[B1,...,Bb, N, R]`, every operation increases in complexity by `B1*...*Bb`. + + #### Matrix property hints + + This `LinearOperator` is initialized with boolean flags of the form `is_X`, + for `X = non_singular, self_adjoint, positive_definite, square`. + These have the following meaning + * If `is_X == True`, callers should expect the operator to have the + property `X`. This is a promise that should be fulfilled, but is *not* a + runtime assert. For example, finite floating point precision may result + in these promises being violated. + * If `is_X == False`, callers should expect the operator to not have `X`. + * If `is_X == None` (the default), callers should have no expectation either + way. + """ + + def __init__(self, + spectrum, + input_output_dtype=_DTYPE_COMPLEX, + is_non_singular=None, + is_self_adjoint=None, + is_positive_definite=None, + is_square=True, + name="LinearOperatorCirculant2D"): + r"""Initialize an `LinearOperatorCirculant2D`. + + This `LinearOperator` is initialized to have shape `[B1,...,Bb, N, N]` + by providing `spectrum`, a `[B1,...,Bb, N0, N1]` `Tensor` with `N0*N1 = N`. 
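+
+ As a NumPy sketch of the block structure (illustrative, not part of this
+ API), a matvec with such an operator is a 2D Fourier multiplier:
+
+ ```python
+ import numpy as np
+
+ N0, N1 = 2, 3  # block shape; the operator shape is [6, 6]
+ H = np.random.rand(N0, N1) + 1j * np.random.rand(N0, N1)  # spectrum
+ u = np.random.rand(N0 * N1)  # input vector, length N = N0 * N1
+ # A u = IDFT2[ H * DFT2[u] ], with u viewed as [N0, N1] blocks.
+ Au = np.fft.ifft2(H * np.fft.fft2(u.reshape(N0, N1))).reshape(N0 * N1)
+ ```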
+ + If `input_output_dtype = DTYPE`: + + * Arguments to methods such as `matmul` or `solve` must be `DTYPE`. + * Values returned by all methods, such as `matmul` or `determinant` will be + cast to `DTYPE`. + + Note that if the spectrum is not Hermitian, then this operator corresponds + to a complex matrix with non-zero imaginary part. In this case, setting + `input_output_dtype` to a real type will forcibly cast the output to be + real, resulting in incorrect results! + + If on the other hand the spectrum is Hermitian, then this operator + corresponds to a real-valued matrix, and setting `input_output_dtype` to + a real type is fine. + + Args: + spectrum: Shape `[B1,...,Bb, N]` `Tensor`. Allowed dtypes are + `float32`, `complex64`. Type can be different than `input_output_dtype` + input_output_dtype: `dtype` for input/output. Must be either + `float32` or `complex64`. + is_non_singular: Expect that this operator is non-singular. + is_self_adjoint: Expect that this operator is equal to its hermitian + transpose. If `spectrum` is real, this will always be true. + is_positive_definite: Expect that this operator is positive definite, + meaning the quadratic form `x^H A x` has positive real part for all + nonzero `x`. Note that we do not require the operator to be + self-adjoint to be positive-definite. See: + https://en.wikipedia.org/wiki/Positive-definite_matrix\ + #Extension_for_non_symmetric_matrices + is_square: Expect that this operator acts like square [batch] matrices. + name: A name to prepend to all ops created by this class. + """ + super(LinearOperatorCirculant2D, self).__init__( + spectrum, + block_depth=2, + input_output_dtype=input_output_dtype, + is_non_singular=is_non_singular, + is_self_adjoint=is_self_adjoint, + is_positive_definite=is_positive_definite, + is_square=is_square, + name=name) + + +@tf_export("linalg.LinearOperatorCirculant3D") +class LinearOperatorCirculant3D(_BaseLinearOperatorCirculant): + """`LinearOperator` acting like a nested block circulant matrix. + + This operator acts like a block circulant matrix `A` with + shape `[B1,...,Bb, N, N]` for some `b >= 0`. The first `b` indices index a + batch member. For every batch index `(i1,...,ib)`, `A[i1,...,ib, : :]` is + an `N x N` matrix. This matrix `A` is not materialized, but for + purposes of broadcasting this shape will be relevant. + + #### Description in terms of block circulant matrices + + If `A` is nested block circulant, with block sizes `N0, N1, N2` + (`N0 * N1 * N2 = N`): + `A` has a block structure, composed of `N0 x N0` blocks, with each + block an `N1 x N1` block circulant matrix. + + For example, with `W`, `X`, `Y`, `Z` each block circulant, + + ``` + A = |W Z Y X| + |X W Z Y| + |Y X W Z| + |Z Y X W| + ``` + + Note that `A` itself will not in general be circulant. + + #### Description in terms of the frequency spectrum + + There is an equivalent description in terms of the [batch] spectrum `H` and + Fourier transforms. Here we consider `A.shape = [N, N]` and ignore batch + dimensions. + + If `H.shape = [N0, N1, N2]`, (`N0 * N1 * N2 = N`): + Loosely speaking, matrix multiplication is equal to the action of a + Fourier multiplier: `A u = IDFT3[ H DFT3[u] ]`. + Precisely speaking, given `[N, R]` matrix `u`, let `DFT3[u]` be the + `[N0, N1, N2, R]` `Tensor` defined by re-shaping `u` to `[N0, N1, N2, R]` and + taking a three dimensional DFT across the first three dimensions. Let `IDFT3` + be the inverse of `DFT3`. 
Matrix multiplication may be expressed columnwise: + + ```(A u)_r = IDFT3[ H * (DFT3[u])_r ]``` + + #### Operator properties deduced from the spectrum. + + * This operator is positive definite if and only if `Real{H} > 0`. + + A general property of Fourier transforms is the correspondence between + Hermitian functions and real valued transforms. + + Suppose `H.shape = [B1,...,Bb, N0, N1, N2]`, we say that `H` is a Hermitian + spectrum if, with `%` meaning modulus division, + + ``` + H[..., n0 % N0, n1 % N1, n2 % N2] + = ComplexConjugate[ H[..., (-n0) % N0, (-n1) % N1, (-n2) % N2] ]. + ``` + + * This operator corresponds to a real matrix if and only if `H` is Hermitian. + * This operator is self-adjoint if and only if `H` is real. + + See e.g. "Discrete-Time Signal Processing", Oppenheim and Schafer. + + ### Examples + + See `LinearOperatorCirculant` and `LinearOperatorCirculant2D` for examples. + + #### Performance + + Suppose `operator` is a `LinearOperatorCirculant` of shape `[N, N]`, + and `x.shape = [N, R]`. Then + + * `operator.matmul(x)` is `O(R*N*Log[N])` + * `operator.solve(x)` is `O(R*N*Log[N])` + * `operator.determinant()` involves a size `N` `reduce_prod`. + + If instead `operator` and `x` have shape `[B1,...,Bb, N, N]` and + `[B1,...,Bb, N, R]`, every operation increases in complexity by `B1*...*Bb`. + + #### Matrix property hints + + This `LinearOperator` is initialized with boolean flags of the form `is_X`, + for `X = non_singular, self_adjoint, positive_definite, square`. + These have the following meaning + * If `is_X == True`, callers should expect the operator to have the + property `X`. This is a promise that should be fulfilled, but is *not* a + runtime assert. For example, finite floating point precision may result + in these promises being violated. + * If `is_X == False`, callers should expect the operator to not have `X`. + * If `is_X == None` (the default), callers should have no expectation either + way. + """ + + def __init__(self, + spectrum, + input_output_dtype=_DTYPE_COMPLEX, + is_non_singular=None, + is_self_adjoint=None, + is_positive_definite=None, + is_square=True, + name="LinearOperatorCirculant3D"): + """Initialize an `LinearOperatorCirculant`. + + This `LinearOperator` is initialized to have shape `[B1,...,Bb, N, N]` + by providing `spectrum`, a `[B1,...,Bb, N0, N1, N2]` `Tensor` + with `N0*N1*N2 = N`. + + If `input_output_dtype = DTYPE`: + + * Arguments to methods such as `matmul` or `solve` must be `DTYPE`. + * Values returned by all methods, such as `matmul` or `determinant` will be + cast to `DTYPE`. + + Note that if the spectrum is not Hermitian, then this operator corresponds + to a complex matrix with non-zero imaginary part. In this case, setting + `input_output_dtype` to a real type will forcibly cast the output to be + real, resulting in incorrect results! + + If on the other hand the spectrum is Hermitian, then this operator + corresponds to a real-valued matrix, and setting `input_output_dtype` to + a real type is fine. + + Args: + spectrum: Shape `[B1,...,Bb, N]` `Tensor`. Allowed dtypes are + `float32`, `complex64`. Type can be different than `input_output_dtype` + input_output_dtype: `dtype` for input/output. Must be either + `float32` or `complex64`. + is_non_singular: Expect that this operator is non-singular. + is_self_adjoint: Expect that this operator is equal to its hermitian + transpose. If `spectrum` is real, this will always be true. 
+ is_positive_definite: Expect that this operator is positive definite, + meaning the real part of all eigenvalues is positive. We do not require + the operator to be self-adjoint to be positive-definite. See: + https://en.wikipedia.org/wiki/Positive-definite_matrix + #Extension_for_non_symmetric_matrices + is_square: Expect that this operator acts like square [batch] matrices. + name: A name to prepend to all ops created by this class. + """ + super(LinearOperatorCirculant3D, self).__init__( + spectrum, + block_depth=3, + input_output_dtype=input_output_dtype, + is_non_singular=is_non_singular, + is_self_adjoint=is_self_adjoint, + is_positive_definite=is_positive_definite, + is_square=is_square, + name=name) + + +def _to_complex(x): + return math_ops.cast(x, _DTYPE_COMPLEX) diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt new file mode 100644 index 00000000000..3b33f3da97e --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt @@ -0,0 +1,14 @@ +path: "tensorflow.linalg.LinearOperatorCirculant.__metaclass__" +tf_class { + is_instance: "" + member_method { + name: "__init__" + } + member_method { + name: "mro" + } + member_method { + name: "register" + argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.pbtxt new file mode 100644 index 00000000000..de917706d55 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant.pbtxt @@ -0,0 +1,155 @@ +path: "tensorflow.linalg.LinearOperatorCirculant" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "batch_shape" + mtype: "" + } + member { + name: "block_depth" + mtype: "" + } + member { + name: "block_shape" + mtype: "" + } + member { + name: "domain_dimension" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "graph_parents" + mtype: "" + } + member { + name: "is_non_singular" + mtype: "" + } + member { + name: "is_positive_definite" + mtype: "" + } + member { + name: "is_self_adjoint" + mtype: "" + } + member { + name: "is_square" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "range_dimension" + mtype: "" + } + member { + name: "shape" + mtype: "" + } + member { + name: "spectrum" + mtype: "" + } + member { + name: "tensor_rank" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'spectrum\', \'input_output_dtype\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\', \'None\', \'None\', \'True\', \'LinearOperatorCirculant\'], " + } + member_method { + name: "add_to_tensor" + argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], " + } + member_method { + name: "assert_hermitian_spectrum" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], " + } + member_method { + name: "assert_non_singular" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], " + } + member_method { + name: "assert_positive_definite" + argspec: 
"args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], " + } + member_method { + name: "assert_self_adjoint" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], " + } + member_method { + name: "batch_shape_tensor" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], " + } + member_method { + name: "block_shape_tensor" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "convolution_kernel" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], " + } + member_method { + name: "determinant" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], " + } + member_method { + name: "diag_part" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], " + } + member_method { + name: "domain_dimension_tensor" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], " + } + member_method { + name: "log_abs_determinant" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], " + } + member_method { + name: "matmul" + argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], " + } + member_method { + name: "matvec" + argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], " + } + member_method { + name: "range_dimension_tensor" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], " + } + member_method { + name: "shape_tensor" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], " + } + member_method { + name: "solve" + argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], " + } + member_method { + name: "solvevec" + argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], " + } + member_method { + name: "tensor_rank_tensor" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], " + } + member_method { + name: "to_dense" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], " + } + member_method { + name: "trace" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt new file mode 100644 index 00000000000..591bc9631a1 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt @@ -0,0 +1,14 @@ +path: "tensorflow.linalg.LinearOperatorCirculant2D.__metaclass__" +tf_class { + is_instance: "" + member_method { + name: "__init__" + } + member_method { + name: "mro" + } + member_method { + name: "register" + argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt 
b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt new file mode 100644 index 00000000000..c4e6a21c3ac --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt @@ -0,0 +1,155 @@ +path: "tensorflow.linalg.LinearOperatorCirculant2D" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "batch_shape" + mtype: "" + } + member { + name: "block_depth" + mtype: "" + } + member { + name: "block_shape" + mtype: "" + } + member { + name: "domain_dimension" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "graph_parents" + mtype: "" + } + member { + name: "is_non_singular" + mtype: "" + } + member { + name: "is_positive_definite" + mtype: "" + } + member { + name: "is_self_adjoint" + mtype: "" + } + member { + name: "is_square" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "range_dimension" + mtype: "" + } + member { + name: "shape" + mtype: "" + } + member { + name: "spectrum" + mtype: "" + } + member { + name: "tensor_rank" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'spectrum\', \'input_output_dtype\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\', \'None\', \'None\', \'True\', \'LinearOperatorCirculant2D\'], " + } + member_method { + name: "add_to_tensor" + argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], " + } + member_method { + name: "assert_hermitian_spectrum" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], " + } + member_method { + name: "assert_non_singular" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], " + } + member_method { + name: "assert_positive_definite" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], " + } + member_method { + name: "assert_self_adjoint" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], " + } + member_method { + name: "batch_shape_tensor" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], " + } + member_method { + name: "block_shape_tensor" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "convolution_kernel" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], " + } + member_method { + name: "determinant" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], " + } + member_method { + name: "diag_part" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], " + } + member_method { + name: "domain_dimension_tensor" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], " + } + member_method { + name: "log_abs_determinant" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], " + } + member_method { + name: "matmul" + argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], " + } + member_method { + name: "matvec" + argspec: "args=[\'self\', \'x\', 
\'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], " + } + member_method { + name: "range_dimension_tensor" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], " + } + member_method { + name: "shape_tensor" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], " + } + member_method { + name: "solve" + argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], " + } + member_method { + name: "solvevec" + argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], " + } + member_method { + name: "tensor_rank_tensor" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], " + } + member_method { + name: "to_dense" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], " + } + member_method { + name: "trace" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt new file mode 100644 index 00000000000..d643139a53f --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt @@ -0,0 +1,14 @@ +path: "tensorflow.linalg.LinearOperatorCirculant3D.__metaclass__" +tf_class { + is_instance: "" + member_method { + name: "__init__" + } + member_method { + name: "mro" + } + member_method { + name: "register" + argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt new file mode 100644 index 00000000000..2e085a8e289 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt @@ -0,0 +1,155 @@ +path: "tensorflow.linalg.LinearOperatorCirculant3D" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "batch_shape" + mtype: "" + } + member { + name: "block_depth" + mtype: "" + } + member { + name: "block_shape" + mtype: "" + } + member { + name: "domain_dimension" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "graph_parents" + mtype: "" + } + member { + name: "is_non_singular" + mtype: "" + } + member { + name: "is_positive_definite" + mtype: "" + } + member { + name: "is_self_adjoint" + mtype: "" + } + member { + name: "is_square" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "range_dimension" + mtype: "" + } + member { + name: "shape" + mtype: "" + } + member { + name: "spectrum" + mtype: "" + } + member { + name: "tensor_rank" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'spectrum\', \'input_output_dtype\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\', \'None\', \'None\', \'True\', \'LinearOperatorCirculant3D\'], " + } + member_method { + name: "add_to_tensor" + argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, 
defaults=[\'add_to_tensor\'], " + } + member_method { + name: "assert_hermitian_spectrum" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], " + } + member_method { + name: "assert_non_singular" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], " + } + member_method { + name: "assert_positive_definite" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], " + } + member_method { + name: "assert_self_adjoint" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], " + } + member_method { + name: "batch_shape_tensor" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], " + } + member_method { + name: "block_shape_tensor" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "convolution_kernel" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], " + } + member_method { + name: "determinant" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], " + } + member_method { + name: "diag_part" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], " + } + member_method { + name: "domain_dimension_tensor" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], " + } + member_method { + name: "log_abs_determinant" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], " + } + member_method { + name: "matmul" + argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], " + } + member_method { + name: "matvec" + argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], " + } + member_method { + name: "range_dimension_tensor" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], " + } + member_method { + name: "shape_tensor" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], " + } + member_method { + name: "solve" + argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], " + } + member_method { + name: "solvevec" + argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], " + } + member_method { + name: "tensor_rank_tensor" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], " + } + member_method { + name: "to_dense" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], " + } + member_method { + name: "trace" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], " + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt index 1d9c0c0f6d2..7a5c5338729 100644 --- a/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt @@ -4,6 +4,18 @@ tf_module { name: "LinearOperator" mtype: "" } + member { + name: "LinearOperatorCirculant" + mtype: "" + 
}
+  member {
+    name: "LinearOperatorCirculant2D"
+    mtype: ""
+  }
+  member {
+    name: "LinearOperatorCirculant3D"
+    mtype: ""
+  }
   member {
     name: "LinearOperatorComposition"
     mtype: ""
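A brief editorial aside before the next patch: the circulant operators registered above implement matmul as a fused DFT, which is easy to check end to end. A minimal sketch against the TF 1.8-era graph API; the spectrum values here are illustrative only, and `matvec` is assumed available as listed in the API goldens above:

```python
import tensorflow as tf

# A 1-D circulant operator acting on length-4 vectors. The spectrum is the
# DFT of the convolution kernel; this one is Hermitian, so the operator is
# real-valued.
spectrum = tf.constant([10., 2., 1., 2.], dtype=tf.complex64)
operator = tf.linalg.LinearOperatorCirculant(spectrum)

x = tf.constant([1., 0., 0., 0.], dtype=tf.complex64)
# matvec is implemented as IDFT(H * DFT(x)), hence the O(N log N) cost
# quoted in the docstring.
y = operator.matvec(x)

# The same product computed directly with FFTs, for comparison.
y_fft = tf.ifft(spectrum * tf.fft(x))

with tf.Session() as sess:
    print(sess.run([y, y_fft]))  # the two results should agree
```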
From b9e12bc69df65eca279a90045d045e661fdb8108 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Tue, 24 Apr 2018 06:24:43 -0700
Subject: [PATCH 0657/1734] Make tf.contrib.framework.zero_initializer work
 with ResourceVariable

PiperOrigin-RevId: 194077027
---
 tensorflow/contrib/framework/BUILD            |  1 +
 .../framework/kernels/zero_initializer_op.cc  | 71 +++++++++++++++++++
 .../contrib/framework/ops/variable_ops.cc     | 29 ++++++++
 .../contrib/framework/python/ops/variables.py |  8 ++-
 .../framework/python/ops/variables_test.py    | 26 +++++++
 5 files changed, 134 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD
index b1c8ad49eaf..f675cc0cf0e 100644
--- a/tensorflow/contrib/framework/BUILD
+++ b/tensorflow/contrib/framework/BUILD
@@ -93,6 +93,7 @@ tf_kernel_library(
     ],
     deps = [
         "//tensorflow/core:framework",
+        "//tensorflow/core:framework_headers_lib",
         "//third_party/eigen3",
     ],
     alwayslink = 1,
diff --git a/tensorflow/contrib/framework/kernels/zero_initializer_op.cc b/tensorflow/contrib/framework/kernels/zero_initializer_op.cc
index 5bf6b675295..6ab3f460b36 100644
--- a/tensorflow/contrib/framework/kernels/zero_initializer_op.cc
+++ b/tensorflow/contrib/framework/kernels/zero_initializer_op.cc
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/resource_var.h"
 
 namespace tensorflow {
 
@@ -85,4 +86,74 @@
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 #undef REGISTER_KERNELS
 
+template <typename Device, typename T>
+class ZeroVarInitializer : public OpKernel {
+ public:
+  explicit ZeroVarInitializer(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &shape_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    Var* variable = nullptr;
+    OP_REQUIRES_OK(ctx, LookupOrCreateResource<Var>(
+                            ctx, HandleFromInput(ctx, 0), &variable,
+                            [this, ctx](Var** var_ptr) {
+                              *var_ptr = new Var(dtype_);
+                              PersistentTensor unused;
+                              Tensor* var_tensor = nullptr;
+                              AllocatorAttributes attr;
+                              attr.set_gpu_compatible(true);
+                              attr.set_nic_compatible(true);
+                              TF_RETURN_IF_ERROR(ctx->allocate_persistent(
+                                  dtype_, shape_, &unused, &var_tensor, attr));
+
+                              functor::TensorSetZero<Device, T>()(
+                                  ctx->eigen_device<Device>(),
+                                  var_tensor->flat<T>());
+
+                              *(*var_ptr)->tensor() = *var_tensor;
+
+                              return Status::OK();
+                            }));
+
+    core::ScopedUnref scoped(variable);
+    mutex_lock ml(*variable->mu());
+
+    OP_REQUIRES(ctx, !variable->is_initialized,
+                errors::InvalidArgument("input is already initialized"));
+
+    variable->is_initialized = true;
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    output->scalar<ResourceHandle>()() = HandleFromInput(ctx, 0);
+  }
+
+ private:
+  DataType dtype_;
+  TensorShape shape_;
+};
+
+#define REGISTER_CPU_KERNELS(type)                            \
+  REGISTER_KERNEL_BUILDER(Name("ZeroVarInitializer")          \
+                              .Device(DEVICE_CPU)             \
+                              .TypeConstraint<type>("dtype"), \
+                          ZeroVarInitializer<CPUDevice, type>);
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
+
+#if GOOGLE_CUDA
+#define REGISTER_GPU_KERNELS(type)                            \
+  REGISTER_KERNEL_BUILDER(Name("ZeroVarInitializer")          \
+                              .Device(DEVICE_GPU)             \
+                              .TypeConstraint<type>("dtype")  \
+                              .HostMemory("var"),             \
+                          ZeroVarInitializer<GPUDevice, type>);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+#undef REGISTER_GPU_KERNELS
+#endif  // GOOGLE_CUDA
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/framework/ops/variable_ops.cc b/tensorflow/contrib/framework/ops/variable_ops.cc
index 706134ba9a5..f6ee6cdb571 100644
--- a/tensorflow/contrib/framework/ops/variable_ops.cc
+++ b/tensorflow/contrib/framework/ops/variable_ops.cc
@@ -39,4 +39,33 @@ ref: Should be from a `Variable` node.
 output_ref:= Same as "ref".
 )doc");
 
+REGISTER_OP("ZeroVarInitializer")
+    .Input("var: resource")
+    .Output("output_var: resource")
+    .Attr("dtype: type")
+    .Attr("shape: shape")
+    .SetAllowsUninitializedInput()
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      DataType t;
+      TF_RETURN_IF_ERROR(c->GetAttr("dtype", &t));
+      PartialTensorShape p;
+      TF_RETURN_IF_ERROR(c->GetAttr("shape", &p));
+      shape_inference::ShapeHandle s;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(p, &s));
+      c->set_output_handle_shapes_and_types(
+          0, std::vector<shape_inference::ShapeAndType>{{s, t}});
+
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Initialize 'var' with all zeros. This op requires that the resource var is
+not initialized. Memory for the var will first be allocated and then filled
+with zeros. This op is intended to save memory during initialization; if you
+use this op, you should not run the initializer of the var.
+
+var: Should be a ResourceVariable.
+output_var:= Same as "var".
+)doc");
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/framework/python/ops/variables.py b/tensorflow/contrib/framework/python/ops/variables.py
index 0754c3e0e30..40ae01bfcce 100644
--- a/tensorflow/contrib/framework/python/ops/variables.py
+++ b/tensorflow/contrib/framework/python/ops/variables.py
@@ -32,6 +32,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.platform import tf_logging as logging
@@ -82,7 +83,12 @@ def zero_initializer(ref, use_locking=True, name="zero_initializer"):
   """
   loader.load_op_library(
       resource_loader.get_path_to_datafile("_variable_ops.so"))
-  return gen_variable_ops.zero_initializer(ref, name=name)
+  if resource_variable_ops.is_resource_variable(ref):
+    return gen_variable_ops.zero_var_initializer(
+        ref.handle, shape=ref.shape, dtype=ref.dtype, name=name)
+  else:
+    return gen_variable_ops.zero_initializer(ref, name=name)
+
 
 @deprecated(None, "Please switch to tf.train.assert_global_step")
 def assert_global_step(global_step_tensor):
diff --git a/tensorflow/contrib/framework/python/ops/variables_test.py b/tensorflow/contrib/framework/python/ops/variables_test.py
index 2f06df93acb..37ea6eb12ab 100644
--- a/tensorflow/contrib/framework/python/ops/variables_test.py
+++ b/tensorflow/contrib/framework/python/ops/variables_test.py
@@ -1284,6 +1284,32 @@ class ZeroInitializerOpTest(test.TestCase):
             [10, 20], dtype=dtype), use_init)
 
 
+class ZeroVarInitializerOpTest(test.TestCase):
+
+  def _testZeroVarInitializer(self, shape, initializer, use_init):
+    var = resource_variable_ops.ResourceVariable(initializer)
+    var_zero = variables_lib2.zero_initializer(var)
+
+    with self.test_session() as sess:
+      with self.assertRaisesOpError('Error while reading resource variable'):
+        var.eval()
+      if use_init:
+        sess.run(var.initializer)
+        with self.assertRaisesOpError('input is already initialized'):
+          var_zero.eval()
+        self.assertAllClose(np.ones(shape), var.eval())
+      else:
+        var_zero.eval()
+        self.assertAllClose(np.zeros(shape), var.eval())
+
+  def testZeroVarInitializer(self):
+    for dtype in (dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64):
+      for use_init in (False, True):
+        self._testZeroVarInitializer([10, 20],
+                                     array_ops.ones([10, 20], dtype=dtype),
+                                     use_init)
+
+
 class FilterVariablesTest(test.TestCase):
 
   def setUp(self):
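A hedged usage sketch of the behavior added by the patch above: `zero_initializer` now dispatches to the new `ZeroVarInitializer` op when handed a resource variable. The variable shape and name here are illustrative only:

```python
import tensorflow as tf

# A resource variable we want allocated and zero-filled in a single op,
# without ever running its regular initializer.
var = tf.get_variable("v", initializer=tf.ones([10, 20]), use_resource=True)
zero_init = tf.contrib.framework.zero_initializer(var)

with tf.Session() as sess:
    sess.run(zero_init)          # allocates the var and fills it with zeros
    print(sess.run(var).sum())   # => 0.0
```

Per the op doc, running `var.initializer` afterwards would defeat the purpose (and zero-initializing twice raises "input is already initialized").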
From 5eb233d0686636a7bacc5b8813c079b6b9aa483c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Tue, 24 Apr 2018 07:06:27 -0700
Subject: [PATCH 0658/1734] Introduce a new HLO shape and sharding matcher.

These new matchers can be used in tests in combination with the existing HLO
opcode matchers to better verify a generated HLO graph.

PiperOrigin-RevId: 194082100
---
 tensorflow/compiler/xla/service/BUILD         |  1 +
 .../compiler/xla/service/hlo_matchers.cc      | 63 +++++++++++++++++
 .../compiler/xla/service/hlo_matchers.h       | 69 +++++++++++++++++++
 .../compiler/xla/service/hlo_matchers_test.cc | 58 ++++++++++++++++
 4 files changed, 191 insertions(+)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index afb344e5ae2..5edb9440c04 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -359,6 +359,7 @@ cc_library(
         ":hlo",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc
index bc74c4bc10c..69deac263ee 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers.cc
@@ -132,6 +132,69 @@ bool HloCustomCallMatcher::MatchAndExplain(
   return result;
 }
 
+bool HloShapeMatcher::MatchAndExplain(
+    const HloInstruction* instruction,
+    ::testing::MatchResultListener* listener) const {
+  if (ShapeUtil::Compatible(instruction->shape(), shape_)) {
+    return true;
+  }
+  *listener << instruction->ToString() << " has incorrect shape (expected: "
+            << ShapeUtil::HumanString(shape_) << ")";
+  return false;
+}
+
+void HloShapeMatcher::DescribeTo(std::ostream* os) const {
+  *os << ShapeUtil::HumanString(shape_);
+}
+
+bool HloShapeAndLayoutMatcher::MatchAndExplain(
+    const HloInstruction* instruction,
+    ::testing::MatchResultListener* listener) const {
+  if (ShapeUtil::Equal(instruction->shape(), shape_)) {
+    return true;
+  }
+  *listener << instruction->ToString() << " has incorrect shape (expected: "
+            << ShapeUtil::HumanStringWithLayout(shape_) << ")";
+  return false;
+}
+
+void HloShapeAndLayoutMatcher::DescribeTo(std::ostream* os) const {
+  *os << ShapeUtil::HumanStringWithLayout(shape_);
+}
+
+bool HloShardingMatcher::MatchAndExplain(
+    const HloInstruction* instruction,
+    ::testing::MatchResultListener* listener) const {
+  if (!sharding_.has_value()) {
+    if (!instruction->has_sharding()) {
+      return true;
+    }
+    *listener << instruction->ToString() << " expected to have no sharding.";
+    return false;
+  }
+  if (instruction->has_sharding()) {
+    if (instruction->sharding() == sharding_.value()) {
+      return true;
+    }
+    *listener << instruction->ToString()
+              << " has incorrect sharding (expected: " << sharding_->ToString()
+              << ")";
+    return false;
+  } else {
+    *listener << instruction->ToString()
+              << " has no sharding (expected: " << sharding_->ToString() << ")";
+    return false;
+  }
+}
+
+void HloShardingMatcher::DescribeTo(std::ostream* os) const {
+  if (sharding_.has_value()) {
+    *os << sharding_->ToString();
+  } else {
+    *os << "<no-sharding>";
+  }
+}
+
 }  // namespace testing
 
 void PrintTo(const HloInstruction* inst, ::std::ostream* os) {
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index 103f04a2cb7..f2ab9b5d9b6 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 
 namespace xla {
 namespace testing {
@@ -86,6 +87,50 @@ class HloCustomCallMatcher : public HloMatcher {
   ::testing::Matcher<string> call_target_matcher_;
 };
 
+class HloShapeMatcher
+    : public ::testing::MatcherInterface<const HloInstruction*> {
+ public:
+  explicit HloShapeMatcher(const Shape& shape) : shape_(shape) {}
+
+  bool MatchAndExplain(const HloInstruction* instruction,
+                       ::testing::MatchResultListener* listener) const override;
+  void DescribeTo(std::ostream* os) const override;
+
+ private:
+  Shape shape_;
+};
+
+class HloShapeAndLayoutMatcher
+    : public ::testing::MatcherInterface<const HloInstruction*> {
+ public:
+  explicit HloShapeAndLayoutMatcher(const Shape& shape) : shape_(shape) {}
+
+  bool MatchAndExplain(const HloInstruction* instruction,
+                       ::testing::MatchResultListener* listener) const override;
+  void DescribeTo(std::ostream* os) const override;
+
+ private:
+  Shape shape_;
+};
+
+// Verify the sharding of an instruction against the provided HloSharding. If
+// a nullopt is provided for the expected sharding, then it checks that no
+// sharding is present for the instruction.
+class HloShardingMatcher
+    : public ::testing::MatcherInterface<const HloInstruction*> {
+ public:
+  explicit HloShardingMatcher(
+      const tensorflow::gtl::optional<HloSharding>& sharding)
+      : sharding_(sharding) {}
+
+  bool MatchAndExplain(const HloInstruction* instruction,
+                       ::testing::MatchResultListener* listener) const override;
+  void DescribeTo(std::ostream* os) const override;
+
+ private:
+  tensorflow::gtl::optional<HloSharding> sharding_;
+};
+
 // HloInstruction* matchers for opcode and operands. Example:
 //   namespace op = xla::opcode_matchers;
 //   EXPECT_THAT(instruction,
@@ -231,6 +276,30 @@ inline ::testing::Matcher<const ::xla::HloInstruction*> CustomCall() {
       new ::xla::testing::HloMatcher(HloOpcode::kCustomCall, {}));
 }
 
+// Verifies the shape, or the shape and the layout, of an HLO instruction
+// against the provided shape object.
+inline ::testing::Matcher<const ::xla::HloInstruction*> Shape(
+    const class Shape& shape) {
+  return ::testing::MakeMatcher(new ::xla::testing::HloShapeMatcher(shape));
+}
+inline ::testing::Matcher<const ::xla::HloInstruction*> ShapeWithLayout(
+    const class Shape& shape) {
+  return ::testing::MakeMatcher(
+      new ::xla::testing::HloShapeAndLayoutMatcher(shape));
+}
+
+// Verifies the value of the HloSharding against the provided sharding object.
+inline ::testing::Matcher<const ::xla::HloInstruction*> Sharding(
+    const HloSharding& sharding) {
+  return ::testing::MakeMatcher(
+      new ::xla::testing::HloShardingMatcher(sharding));
+}
+// Verifies that no HloSharding is set for an HLO instruction.
+inline ::testing::Matcher<const ::xla::HloInstruction*> NoSharding() {
+  return ::testing::MakeMatcher(
+      new ::xla::testing::HloShardingMatcher(tensorflow::gtl::nullopt));
+}
+
 #undef HLO_MATCHER
 }  // namespace opcode_matchers
diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
index 1c21703a45e..c6373b2e46a 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
@@ -100,5 +100,63 @@ TEST(HloMatchersTest, CustomCallMatcher) {
       R"(custom-call with call target that is equal to "foo_target")");
 }
 
+TEST(HloMatchersTest, ShapeMatcher) {
+  auto p0 = HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShapeWithLayout(F32, {5, 7}, {0, 1}), "param");
+
+  EXPECT_THAT(p0.get(), op::Shape(ShapeUtil::MakeShape(F32, {5, 7})));
+  EXPECT_THAT(
+      p0.get(),
+      ::testing::Not(op::ShapeWithLayout(ShapeUtil::MakeShape(F32, {5, 7}))));
+  EXPECT_THAT(p0.get(),
+              ::testing::Not(op::Shape(ShapeUtil::MakeShape(F32, {7, 5}))));
+  EXPECT_THAT(
+      p0.get(),
+      ::testing::Not(op::ShapeWithLayout(ShapeUtil::MakeShape(F32, {7, 5}))));
+  EXPECT_THAT(p0.get(),
+              op::Shape(ShapeUtil::MakeShapeWithLayout(F32, {5, 7}, {0, 1})));
+  EXPECT_THAT(p0.get(), op::ShapeWithLayout(ShapeUtil::MakeShapeWithLayout(
+                            F32, {5, 7}, {0, 1})));
+  EXPECT_THAT(p0.get(),
+              ::testing::Not(op::ShapeWithLayout(
+                  ShapeUtil::MakeShapeWithLayout(F32, {5, 7}, {1, 0}))));
+
+  EXPECT_THAT(Explain(p0.get(), op::Shape(ShapeUtil::MakeShape(F32, {7, 5}))),
+              "%param = f32[5,7]{0,1} parameter(0) has incorrect shape "
+              "(expected: f32[7,5])");
+  EXPECT_THAT(
+      Explain(p0.get(), op::ShapeWithLayout(ShapeUtil::MakeShapeWithLayout(
+                            F32, {7, 5}, {1, 0}))),
+      "%param = f32[5,7]{0,1} parameter(0) has incorrect shape "
+      "(expected: f32[7,5]{1,0})");
+}
+
+TEST(HloMatchersTest, ShardingMatcher) {
+  auto p0 = HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {5}),
+                                            "param.0");
+  p0->clear_sharding();
+  auto p1 = HloInstruction::CreateParameter(1, ShapeUtil::MakeShape(F32, {7}),
+                                            "param.1");
+  p1->set_sharding(HloSharding::AssignDevice(1));
+
+  EXPECT_THAT(p0.get(), op::NoSharding());
+  EXPECT_THAT(p0.get(),
+              ::testing::Not(op::Sharding(HloSharding::AssignDevice(1))));
+  EXPECT_THAT(p1.get(), ::testing::Not(op::NoSharding()));
+  EXPECT_THAT(p1.get(),
+              ::testing::Not(op::Sharding(HloSharding::AssignDevice(0))));
+  EXPECT_THAT(p1.get(), op::Sharding(HloSharding::AssignDevice(1)));
+
+  EXPECT_THAT(Explain(p0.get(), op::Sharding(HloSharding::AssignDevice(1))),
+              "%param.0 = f32[5]{0} parameter(0) has no sharding (expected: "
+              "{maximal device=1})");
+  EXPECT_THAT(Explain(p1.get(), op::NoSharding()),
+              "%param.1 = f32[7]{0} parameter(1), sharding={maximal device=1} "
+              "expected to have no sharding.");
+  EXPECT_THAT(Explain(p1.get(), op::Sharding(HloSharding::AssignDevice(0))),
+              "%param.1 = f32[7]{0} parameter(1), sharding={maximal device=1} "
+              "has incorrect sharding (expected: {maximal device=0})");
+}
+
 }  // namespace
 }  // namespace xla
From 1ce99cfa52b19a40cff8a9ae983a0a7f04eb2bf1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Tue, 24 Apr 2018 07:38:49 -0700
Subject: [PATCH 0659/1734] Softens the requirements in the HLO sharding
 validation

The goal is to support tiled shardings where the last N tiles have no data.
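To make the relaxed rule concrete before the diff: per dimension, the tile assignment now only needs to *cover* the shape, so trailing tiles may be partially or entirely empty. An illustrative Python model of the new check (not a TF API, just the arithmetic the C++ below implements):

```python
def tile_assignment_covers(shape_dims, tile_dims, assignment_dims):
    """Models the relaxed validation: tiles must cover the shape.

    shape_dims:      dimensions of the operand shape
    tile_dims:       dimensions of a single tile
    assignment_dims: dimensions of the tile assignment tensor
    """
    for shape_d, tile_d, num_tiles in zip(shape_dims, tile_dims,
                                          assignment_dims):
        total_tile_size = num_tiles * tile_d
        if shape_d > total_tile_size:  # not enough tiles to cover the data
            return False
    return True

# 2x2 tiles of shape [2, 3] cover a [3, 5] operand (with padding) ...
assert tile_assignment_covers([3, 5], [2, 3], [2, 2])
# ... but not a [6, 3] operand: dimension 0 offers only 2 * 2 = 4 rows.
assert not tile_assignment_covers([6, 3], [2, 3], [2, 2])
```

The second assertion mirrors the updated unit test below, where dimension 0 has total tiled size 4 but 6 elements.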
PiperOrigin-RevId: 194085302
---
 .../compiler/xla/service/hlo_sharding.cc      | 39 +++++++------------
 .../compiler/xla/service/hlo_sharding_test.cc | 15 ++-----
 2 files changed, 16 insertions(+), 38 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index 1b42349b0b3..994de441237 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -256,37 +256,24 @@ Status HloSharding::ValidateNonTuple(const Shape& shape,
                          ", input_shape=", ShapeUtil::HumanString(shape));
   }
 
-  // The tile shape must not be the same as the input shape without maximal_
-  // also set. If this is the case, we're not actually sharded and the correct
-  // constructor should have been used.
-  if (ShapeUtil::Equal(shape, tile_shape_)) {
+  // The correct constructor has to be used to create tile maximal shardings.
+  if (tile_assignment_.num_elements() == 1) {
     return tensorflow::errors::InvalidArgument(
-        "Tile shape is the same as the input shape. If a replicated sharding "
-        "was intended, use HloSharding::Replicated(). If a device placement "
-        "was intended, use HloSharding::AssignDevice()");
+        "Tile assignment only contains a single device. If a replicated "
+        "sharding was intended, use HloSharding::Replicated(). If a device "
+        "placement was intended, use HloSharding::AssignDevice()");
   }
 
-  // The tile shape must not be greater than the input shape in any dimension.
-  for (int64 i = 0, e = ShapeUtil::Rank(shape); i != e; ++i) {
-    auto tile_dim = tile_shape_.dimensions(i);
-    auto shape_dim = shape.dimensions(i);
-    if (tile_dim > shape_dim) {
-      return tensorflow::errors::InvalidArgument(
-          StrCat("Tile is larger than input shape (dimension ", i, ", ",
-                 tile_dim, " > ", shape_dim));
-    }
-  }
-
-  // The tile assignment tensor must be exactly dimensioned to ceil(shape[dim]
-  // tile[dim]) for every dimension contained within tile.
+  // The tile assignment tensor must contain enough elements to cover the full
+  // shape with tiles of the specified size.
   for (int64 i = 0, e = tile_assignment_.dimensions().size(); i != e; ++i) {
-    int64 expected_dim =
-        CeilOfRatio(shape.dimensions(i), tile_shape_.dimensions(i));
-    if (tile_assignment_.dimensions()[i] != expected_dim) {
+    int64 total_tile_size = tile_assignment_.dim(i) * tile_shape_.dimensions(i);
+    if (shape.dimensions(i) > total_tile_size) {
       return tensorflow::errors::InvalidArgument(
-          StrCat("Tile assignment tensor has incorrect shape. Dimension ", i,
-                 " expected ", expected_dim, " but got ",
-                 tile_assignment_.dimensions()[i]));
+          StrCat("Tile assignment tensor has too few elements to cover the "
+                 "full shape. Dimension ",
+                 i, ", shape ", shape.dimensions(i), ", total size ",
+                 total_tile_size));
     }
   }
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
index 69ea4233e45..3bf0d25efb7 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
@@ -88,7 +88,7 @@ TEST_F(HloShardingTest, Tile) {
   }
 
   {
-    // Test should pass.
+    // Test should fail because more devices are used than `num_devices`.
     Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3});
     HloSharding sharding =
         HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 1, 2, 3}));
@@ -97,17 +97,8 @@
   }
 
   {
-    // Test should fail due to the tile being larger than the input space.
-    Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3});
-    HloSharding sharding =
-        HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 1, 2, 3}));
-    EXPECT_IS_NOT_OK(sharding.Validate(ShapeUtil::MakeShape(F32, {2, 2}),
-                                       /*num_devices=*/4));
-  }
-
-  {
-    // Test should fail due to the tile not dividing the input space into 4
-    // sections (even with padding).
+    // Test should fail because the total tiled size in dimension 0 is 4 but
+    // we have 6 elements along that dimension.
     Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3});
     HloSharding sharding =
         HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 1, 2, 3}));
From 38b531ddfb1e2fd0afd765710e4416fd555b98ae Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Tue, 24 Apr 2018 09:11:15 -0700
Subject: [PATCH 0660/1734] Internal Change
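For orientation before the BUILD diffs below: this patch splits the monolithic proto target while keeping the public `:protos_all` name stable for dependents. A hedged Starlark (Python-syntax) sketch of the resulting layering, target names taken from the diff and structure abbreviated:

```python
# tensorflow/core/BUILD (sketch) -- error_codes.proto is split out so the
# proto_text generation for the common protos can depend on it separately.
tf_proto_library(
    name = "error_codes_proto",
    srcs = ERROR_CODES_PROTO_SRCS,
)

tf_proto_library(
    name = "protos_all_proto",
    srcs = COMMON_PROTO_SRCS + ADDITIONAL_CORE_PROTO_SRCS,
    protodeps = [":error_codes_proto"],
)

# The public target keeps its old name; with empty srcs it now acts as a
# collection that forwards to the two sub-libraries above.
tf_proto_library(
    name = "protos_all",
    srcs = [],
    protodeps = [
        ":protos_all_proto",
        ":error_codes_proto",
    ],
)
```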
# @@ -224,12 +229,16 @@ ADDITIONAL_CORE_PROTO_SRCS = [ tf_proto_library( name = "protos_all", - srcs = CORE_PROTO_SRCS + ADDITIONAL_CORE_PROTO_SRCS, + srcs = [], cc_api_version = 2, default_header = True, j2objc_api_version = 1, java_api_version = 2, js_api_version = 2, + protodeps = [ + ":protos_all_proto", + ":error_codes_proto", + ], visibility = ["//visibility:public"], ) @@ -1134,7 +1143,8 @@ filegroup( filegroup( name = "mobile_srcs_no_runtime", srcs = [ - ":proto_text_srcs_all", + ":protos_all_proto_text_srcs", + ":error_codes_proto_text_srcs", "//tensorflow/core/platform/default/build_config:android_srcs", ] + glob( [ @@ -1930,15 +1940,58 @@ cc_library( ], ) -proto_text_hdrs_and_srcs = tf_generate_proto_text_sources( - name = "proto_text_srcs_all", - srcs = CORE_PROTO_SRCS, +tf_proto_library( + name = "error_codes_proto", + srcs = ERROR_CODES_PROTO_SRCS, + cc_api_version = 2, + default_header = True, + j2objc_api_version = 1, + java_api_version = 2, + js_api_version = 2, +) + +tf_generate_proto_text_sources( + name = "error_codes_proto_text", + srcs = ERROR_CODES_PROTO_SRCS, + protodeps = [], srcs_relative_dir = "tensorflow/core/", + deps = [ + ":error_codes_proto_cc", + ":lib_internal", + ], +) + +tf_proto_library( + name = "protos_all_proto", + srcs = COMMON_PROTO_SRCS + ADDITIONAL_CORE_PROTO_SRCS, + cc_api_version = 2, + default_header = True, + j2objc_api_version = 1, + java_api_version = 2, + js_api_version = 2, + protodeps = [ + ":error_codes_proto", + ], +) + +tf_generate_proto_text_sources( + name = "protos_all_proto_text", + srcs = COMMON_PROTO_SRCS, + protodeps = ERROR_CODES_PROTO_SRCS, + srcs_relative_dir = "tensorflow/core/", + deps = [ + ":error_codes_proto_text", + ":lib_internal", + ":protos_all_proto_cc", + ], ) cc_library( name = "proto_text", - hdrs = proto_text_hdrs_and_srcs.hdrs, + hdrs = [ + ":error_codes_proto_text_hdrs", + ":protos_all_proto_text_hdrs", + ], deps = [ ":lib", ":lib_internal", @@ -2083,7 +2136,7 @@ tf_cuda_library( "util/memmapped_file_system.cc", "util/memmapped_file_system_writer.cc", ], - }) + proto_text_hdrs_and_srcs.srcs + tf_additional_framework_srcs(), + }) + tf_additional_framework_srcs(), hdrs = FRAMEWORK_INTERNAL_PUBLIC_HEADERS, copts = tf_copts(), linkopts = select({ @@ -2097,7 +2150,8 @@ tf_cuda_library( deps = [ ":lib", ":lib_internal", - ":proto_text", + ":protos_all_proto_text", + ":error_codes_proto_text", ":protos_all_cc", ":version_lib", "//tensorflow/core/platform/default/build_config:platformlib", diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index 44356e34383..ca0587e2777 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -319,10 +319,34 @@ def tf_proto_library_cc(name, srcs = [], has_services = None, use_grpc_plugin = None if cc_grpc_version: use_grpc_plugin = True + + cc_deps = tf_deps(protodeps, "_cc") + cc_name = name + "_cc" + if not srcs: + # This is a collection of sub-libraries. Build header-only and impl + # libraries containing all the sources. 
+ proto_gen( + name = cc_name + "_genproto", + deps = [s + "_genproto" for s in cc_deps], + protoc = "@protobuf_archive//:protoc", + visibility=["//visibility:public"], + ) + native.cc_library( + name = cc_name, + deps = cc_deps + ["@protobuf_archive//:protobuf_headers"] + + if_static([name + "_cc_impl"]), + ) + native.cc_library( + name = cc_name + "_impl", + deps = [s + "_impl" for s in cc_deps] + ["@protobuf_archive//:cc_wkt_protos"], + ) + + return + cc_proto_library( - name = name + "_cc", + name = cc_name, srcs = srcs, - deps = tf_deps(protodeps, "_cc") + ["@protobuf_archive//:cc_wkt_protos"], + deps = cc_deps + ["@protobuf_archive//:cc_wkt_protos"], cc_libs = cc_libs + if_static( ["@protobuf_archive//:protobuf"], ["@protobuf_archive//:protobuf_headers"] @@ -341,11 +365,28 @@ def tf_proto_library_cc(name, srcs = [], has_services = None, def tf_proto_library_py(name, srcs=[], protodeps=[], deps=[], visibility=[], testonly=0, srcs_version="PY2AND3", use_grpc_plugin=False): + py_deps = tf_deps(protodeps, "_py") + py_name = name + "_py" + if not srcs: + # This is a collection of sub-libraries. Build header-only and impl + # libraries containing all the sources. + proto_gen( + name = py_name + "_genproto", + deps = [s + "_genproto" for s in py_deps], + protoc = "@protobuf_archive//:protoc", + visibility=["//visibility:public"], + ) + native.py_library( + name = py_name, + deps = py_deps + ["@protobuf_archive//:protobuf_python"]) + + return + py_proto_library( - name = name + "_py", + name = py_name, srcs = srcs, srcs_version = srcs_version, - deps = deps + tf_deps(protodeps, "_py") + ["@protobuf_archive//:protobuf_python"], + deps = deps + py_deps + ["@protobuf_archive//:protobuf_python"], protoc = "@protobuf_archive//:protoc", default_runtime = "@protobuf_archive//:protobuf_python", visibility = visibility, diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 51e856bed0e..a9ddd4fc606 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -37,20 +37,25 @@ def src_to_test_name(src): def full_path(relative_paths): return [native.package_name() + "/" + relative for relative in relative_paths] +def _add_tfcore_prefix(src): + if src.startswith("//"): + return src + return "//tensorflow/core:" + src + # List of proto files for android builds def tf_android_core_proto_sources(core_proto_sources_relative): return [ - "//tensorflow/core:" + p for p in core_proto_sources_relative + _add_tfcore_prefix(p) for p in core_proto_sources_relative ] # Returns the list of pb.h and proto.h headers that are generated for # tf_android_core_proto_sources(). def tf_android_core_proto_headers(core_proto_sources_relative): return ([ - "//tensorflow/core/" + p.replace(".proto", ".pb.h") + _add_tfcore_prefix(p).replace(":", "/").replace(".proto", ".pb.h") for p in core_proto_sources_relative ] + [ - "//tensorflow/core/" + p.replace(".proto", ".proto.h") + _add_tfcore_prefix(p).replace(":", "/").replace(".proto", ".proto.h") for p in core_proto_sources_relative ]) @@ -1672,22 +1677,36 @@ def cuda_py_tests(name, # # Return a struct with fields (hdrs, srcs) containing the names of the # generated files. 
-def tf_generate_proto_text_sources(name, srcs_relative_dir, srcs): +def tf_generate_proto_text_sources(name, srcs_relative_dir, srcs, protodeps=[], deps=[], visibility=None): out_hdrs = ( [p.replace(".proto", ".pb_text.h") for p in srcs] + [p.replace(".proto", ".pb_text-impl.h") for p in srcs]) out_srcs = [p.replace(".proto", ".pb_text.cc") for p in srcs] native.genrule( - name=name, - srcs=srcs + [clean_dep("//tensorflow/tools/proto_text:placeholder.txt")], + name=name + "_srcs", + srcs=srcs + protodeps + [clean_dep("//tensorflow/tools/proto_text:placeholder.txt")], outs=out_hdrs + out_srcs, + visibility=visibility, cmd= "$(location //tensorflow/tools/proto_text:gen_proto_text_functions) " + "$(@D) " + srcs_relative_dir + " $(SRCS)", tools=[ clean_dep("//tensorflow/tools/proto_text:gen_proto_text_functions") ],) - return struct(hdrs=out_hdrs, srcs=out_srcs) + + native.filegroup( + name=name + "_hdrs", + srcs=out_hdrs, + visibility=visibility, + ) + + native.cc_library( + name=name, + srcs=out_srcs, + hdrs=out_hdrs, + visibility=visibility, + deps = deps, + ) def tf_genrule_cmd_append_to_srcs(to_append): return ("cat $(SRCS) > $(@) && " + "echo >> $(@) && " + "echo " + to_append + diff --git a/tensorflow/tools/proto_text/BUILD b/tensorflow/tools/proto_text/BUILD index ef7bfdd3c9e..31e8fb9120c 100644 --- a/tensorflow/tools/proto_text/BUILD +++ b/tensorflow/tools/proto_text/BUILD @@ -75,9 +75,14 @@ tf_proto_library_cc( ) tf_generate_proto_text_sources( - name = "test_proto_text_srcs", + name = "test_proto_text", srcs = ["test.proto"], srcs_relative_dir = "tensorflow/tools/proto_text/", + deps = [ + ":test_proto_cc", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + ], ) tf_cc_test( diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions.cc b/tensorflow/tools/proto_text/gen_proto_text_functions.cc index f0bb59acf80..234afe879bc 100644 --- a/tensorflow/tools/proto_text/gen_proto_text_functions.cc +++ b/tensorflow/tools/proto_text/gen_proto_text_functions.cc @@ -130,7 +130,11 @@ int MainImpl(int argc, char** argv) { const string path = output_root + "/" + proto_path_no_suffix + suffix; FILE* f = fopen(path.c_str(), "w"); - if (f == nullptr) return -1; + if (f == nullptr) { + // We don't expect this output to be generated. It was specified in the + // list of sources solely to satisfy a proto import dependency. + continue; + } if (fwrite(data.c_str(), 1, data.size(), f) != data.size()) { fclose(f); return -1; From b7f957ceedb6f47e4d68c506389bff210c35ef6a Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Tue, 24 Apr 2018 09:15:07 -0700 Subject: [PATCH 0661/1734] Add S64 clamp test. 
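An aside for readers tracking the clamp test below at the TF Python level: `tf.clip_by_value` has the same clamp semantics, so the `int64` case can be sanity-checked without an XLA build. A minimal sketch, with the values copied from the test:

```python
import tensorflow as tf

x = tf.constant([-3, 3, 9, 13], dtype=tf.int64)
clamped = tf.clip_by_value(x, 0, 10)  # same semantics as Clamp(0, x, 10)

with tf.Session() as sess:
    print(sess.run(clamped))  # => [ 0  3  9 10]
```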
PiperOrigin-RevId: 194096814 --- .../compiler/xla/tests/vector_ops_simple_test.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc index 697d78fe6e9..8b86b5e760c 100644 --- a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc +++ b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc @@ -348,6 +348,17 @@ XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstantNonzeroLower) { ComputeAndCompareR1(&builder, expected, {}); } +XLA_TEST_F(VecOpsSimpleTest, ClampValuesConstantS64) { + ComputationBuilder builder(client_, TestName()); + auto zero = builder.ConstantR0(0); + auto one = builder.ConstantR0(10); + auto x = builder.ConstantR1({-3, 3, 9, 13}); + auto clamp = builder.Clamp(zero, x, one); + + std::vector expected = {0, 3, 9, 10}; + ComputeAndCompareR1(&builder, expected, {}); +} + XLA_TEST_F(VecOpsSimpleTest, MapTenValues) { Computation add_half; { From cfedd67f5881ae3697638e9b74eccb7da9818a0e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 24 Apr 2018 09:44:52 -0700 Subject: [PATCH 0662/1734] Add an attr to apply_adagrad op that allows it to skip updating the accumulators. PiperOrigin-RevId: 194100678 --- tensorflow/core/kernels/training_ops.cc | 23 ++++++++++++++----- tensorflow/core/kernels/training_ops.h | 2 +- .../core/kernels/training_ops_gpu.cu.cc | 6 +++-- tensorflow/core/ops/training_ops.cc | 4 ++++ 4 files changed, 26 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc index 5b13b109375..271329599fa 100644 --- a/tensorflow/core/kernels/training_ops.cc +++ b/tensorflow/core/kernels/training_ops.cc @@ -153,8 +153,10 @@ struct ApplyAdagrad { void operator()(const CPUDevice& d, typename TTypes::Flat var, typename TTypes::Flat accum, typename TTypes::ConstScalar lr, - typename TTypes::ConstFlat grad) { - accum.device(d) += grad.square(); + typename TTypes::ConstFlat grad, bool update_slots) { + if (update_slots) { + accum.device(d) += grad.square(); + } var.device(d) -= grad * lr() * accum.rsqrt(); } }; @@ -1074,6 +1076,7 @@ class ApplyAdagradOp : public OpKernel { public: explicit ApplyAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("update_slots", &update_slots_)); } void Compute(OpKernelContext* ctx) override { @@ -1111,13 +1114,15 @@ class ApplyAdagradOp : public OpKernel { const Device& device = ctx->template eigen_device(); functor::ApplyAdagrad()(device, var.flat(), accum.flat(), - lr.scalar(), grad.flat()); + lr.scalar(), grad.flat(), + update_slots_); MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; + bool update_slots_; }; #define REGISTER_KERNELS(D, T) \ @@ -1145,7 +1150,7 @@ namespace functor { void ApplyAdagrad::operator()( \ const GPUDevice& d, typename TTypes::Flat var, \ typename TTypes::Flat accum, typename TTypes::ConstScalar lr, \ - typename TTypes::ConstFlat grad); \ + typename TTypes::ConstFlat grad, bool update_slots); \ extern template struct ApplyAdagrad; DECLARE_GPU_SPEC(Eigen::half); DECLARE_GPU_SPEC(float); @@ -1266,6 +1271,7 @@ class SparseApplyAdagradOp : public OpKernel { public: explicit SparseApplyAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("update_slots", &update_slots_)); } 
void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { @@ -1339,7 +1345,9 @@ class SparseApplyAdagradOp : public OpKernel { auto a = accum_flat.template chip<0>(index); auto g = grad_flat.template chip<0>(i); auto v = var_flat.template chip<0>(index); - a += g.square(); + if (update_slots_) { + a += g.square(); + } v -= g.constant(lr_scalar) * g * a.rsqrt(); } } else { @@ -1358,7 +1366,9 @@ class SparseApplyAdagradOp : public OpKernel { " in indices is out of range"))); T& a = accum_flat(index); const T& g = grad_flat(i); - a += g * g; + if (update_slots_) { + a += g * g; + } var_flat(index) -= lr_scalar * g / Eigen::numext::sqrt(a); } } @@ -1369,6 +1379,7 @@ class SparseApplyAdagradOp : public OpKernel { private: bool use_exclusive_lock_; + bool update_slots_; }; #define REGISTER_KERNELS(T, Tindices) \ diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h index f536a61eb06..495a94f1a1b 100644 --- a/tensorflow/core/kernels/training_ops.h +++ b/tensorflow/core/kernels/training_ops.h @@ -68,7 +68,7 @@ struct ApplyAdagrad { void operator()(const Device& d, typename TTypes::Flat var, typename TTypes::Flat accum, typename TTypes::ConstScalar lr, - typename TTypes::ConstFlat grad); + typename TTypes::ConstFlat grad, bool update_slots); }; template diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc index 2aa17f2a0f3..4bd32592db1 100644 --- a/tensorflow/core/kernels/training_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc @@ -42,8 +42,10 @@ struct ApplyAdagrad { void operator()(const GPUDevice& d, typename TTypes::Flat var, typename TTypes::Flat accum, typename TTypes::ConstScalar lr, - typename TTypes::ConstFlat grad) { - accum.device(d) += grad.square(); + typename TTypes::ConstFlat grad, bool update_slots) { + if (update_slots) { + accum.device(d) += grad.square(); + } Eigen::array::Tensor::Index, 1> bcast; bcast[0] = grad.dimension(0); Eigen::Sizes<1> single; diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc index dc7b588898c..94ff092a85d 100644 --- a/tensorflow/core/ops/training_ops.cc +++ b/tensorflow/core/ops/training_ops.cc @@ -253,6 +253,7 @@ REGISTER_OP("ApplyAdagrad") .Output("out: Ref(T)") .Attr("T: numbertype") .Attr("use_locking: bool = false") + .Attr("update_slots: bool = true") .SetShapeFn([](InferenceContext* c) { return ApplyAdagradShapeFn(c, false /* sparse */); }); @@ -264,6 +265,7 @@ REGISTER_OP("ResourceApplyAdagrad") .Input("grad: T") .Attr("T: numbertype") .Attr("use_locking: bool = false") + .Attr("update_slots: bool = true") .SetShapeFn([](InferenceContext* c) { return ApplyAdagradShapeFn(c, false /* sparse */); }); @@ -320,6 +322,7 @@ REGISTER_OP("SparseApplyAdagrad") .Attr("T: numbertype") .Attr("Tindices: {int32, int64}") .Attr("use_locking: bool = false") + .Attr("update_slots: bool = true") .SetShapeFn([](InferenceContext* c) { return ApplyAdagradShapeFn(c, true /* sparse */); }); @@ -333,6 +336,7 @@ REGISTER_OP("ResourceSparseApplyAdagrad") .Attr("T: numbertype") .Attr("Tindices: {int32, int64}") .Attr("use_locking: bool = false") + .Attr("update_slots: bool = true") .SetShapeFn([](InferenceContext* c) { return ApplyAdagradShapeFn(c, true /* sparse */); }); From 9c7e819352581bf5a97509b1fa5dc71dffa26500 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 24 Apr 2018 10:24:26 -0700 Subject: [PATCH 0663/1734] Enable all arithmetic optimizations by default. 
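Since the change below turns these rewrites on by default, users who hit issues can still opt out per session through the rewriter config. A hedged sketch against the TF 1.8-era proto API:

```python
import tensorflow as tf
from tensorflow.core.protobuf import rewriter_config_pb2

# Disable Grappler's arithmetic optimizer for one session, e.g. to rule it
# out while debugging a numeric difference.
rewrite_options = rewriter_config_pb2.RewriterConfig(
    arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF)
graph_options = tf.GraphOptions(rewrite_options=rewrite_options)
config = tf.ConfigProto(graph_options=graph_options)

with tf.Session(config=config) as sess:
    pass  # build and run the graph as usual
```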
PiperOrigin-RevId: 194106835 --- .../core/grappler/optimizers/arithmetic_optimizer.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h index c0fe8839ca7..344c8281eb1 100644 --- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h +++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h @@ -57,9 +57,9 @@ class ArithmeticOptimizer : public GraphOptimizer { // TODO(ezhulenev): flag do disable TrySimplifyAndReplaceUses in tests. // Remove when all optimizers will be migrated to separate stages. bool enable_try_simplify_and_replace = true; - bool combine_add_to_addn = false; + bool combine_add_to_addn = true; bool hoist_common_factor_out_of_aggregation = true; - bool minimize_broadcasts = false; + bool minimize_broadcasts = true; bool remove_identity_transpose = true; bool remove_redundant_bitcast = true; bool remove_redundant_cast = true; @@ -70,11 +70,6 @@ class ArithmeticOptimizer : public GraphOptimizer { static ArithmeticOptimizerOptions Default( RewriterConfig::Toggle opt_level) { ArithmeticOptimizerOptions options; - // TODO(ezhulenev): enable by default after 1.8 release cut - if (opt_level == RewriterConfig::AGGRESSIVE) { - options.combine_add_to_addn = true; - options.minimize_broadcasts = true; - } return options; } }; From 55a4a479df8e1fbc8aa726596e6d4591364b3585 Mon Sep 17 00:00:00 2001 From: Sherry Moore Date: Tue, 24 Apr 2018 10:31:17 -0700 Subject: [PATCH 0664/1734] Added a call in CheckpointSaverHook.after_create_session to always save checkpoint before the first training step. PiperOrigin-RevId: 194107958 --- .../python/learn/estimators/estimator_test.py | 4 +- tensorflow/python/estimator/estimator_test.py | 4 +- .../training/basic_session_run_hooks.py | 36 ++++++++++-------- .../training/basic_session_run_hooks_test.py | 38 ++++++++++++++++--- 4 files changed, 58 insertions(+), 24 deletions(-) diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py index d81a534b79b..9e5aaf3118d 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py @@ -715,7 +715,9 @@ class EstimatorTest(test.TestCase): ckpt = checkpoint_state_pb2.CheckpointState() text_format.Merge(checkpoint_file_content, ckpt) self.assertEqual(ckpt.model_checkpoint_path, 'model.ckpt-5') - self.assertAllEqual(['model.ckpt-1', 'model.ckpt-5'], + # TODO(b/78461127): Please modify tests to not directly rely on names of + # checkpoints. + self.assertAllEqual(['model.ckpt-0', 'model.ckpt-5'], ckpt.all_model_checkpoint_paths) def test_train_save_copy_reload(self): diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py index d453e19357a..0fea86124cc 100644 --- a/tensorflow/python/estimator/estimator_test.py +++ b/tensorflow/python/estimator/estimator_test.py @@ -679,8 +679,10 @@ class EstimatorTrainTest(test.TestCase): ckpt = checkpoint_state_pb2.CheckpointState() text_format.Merge(checkpoint_file_content, ckpt) self.assertEqual(ckpt.model_checkpoint_path, 'model.ckpt-5') + # TODO(b/78461127): Please modify tests to not directly rely on names of + # checkpoints. 
self.assertAllEqual(
-        ['model.ckpt-1', 'model.ckpt-5'], ckpt.all_model_checkpoint_paths)
+        ['model.ckpt-0', 'model.ckpt-5'], ckpt.all_model_checkpoint_paths)
 
   def test_train_save_copy_reload(self):
     tmpdir = tempfile.mkdtemp()
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index 3651291bdfc..47339e057fb 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -434,23 +434,27 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
     for l in self._listeners:
       l.begin()
 
-  def before_run(self, run_context):  # pylint: disable=unused-argument
-    if self._timer.last_triggered_step() is None:
-      # We do write graph and saver_def at the first call of before_run.
-      # We cannot do this in begin, since we let other hooks to change graph and
-      # add variables in begin. Graph is finalized after all begin calls.
-      training_util.write_graph(
-          ops.get_default_graph().as_graph_def(add_shapes=True),
-          self._checkpoint_dir,
-          "graph.pbtxt")
-      saver_def = self._get_saver().saver_def if self._get_saver() else None
-      graph = ops.get_default_graph()
-      meta_graph_def = meta_graph.create_meta_graph_def(
-          graph_def=graph.as_graph_def(add_shapes=True),
-          saver_def=saver_def)
-      self._summary_writer.add_graph(graph)
-      self._summary_writer.add_meta_graph(meta_graph_def)
+  def after_create_session(self, session, coord):
+    global_step = session.run(self._global_step_tensor)
+    # We write the graph and saver_def once, when the session is created. We
+    # cannot do this in begin, since we let other hooks change the graph and
+    # add variables in begin. The graph is finalized after all begin calls.
+    training_util.write_graph(
+        ops.get_default_graph().as_graph_def(add_shapes=True),
+        self._checkpoint_dir,
+        "graph.pbtxt")
+    saver_def = self._get_saver().saver_def if self._get_saver() else None
+    graph = ops.get_default_graph()
+    meta_graph_def = meta_graph.create_meta_graph_def(
+        graph_def=graph.as_graph_def(add_shapes=True),
+        saver_def=saver_def)
+    self._summary_writer.add_graph(graph)
+    self._summary_writer.add_meta_graph(meta_graph_def)
+    # The checkpoint saved here is the state at step "global_step".
+ self._save(session, global_step) + self._timer.update_last_triggered_step(global_step) + def before_run(self, run_context): # pylint: disable=unused-argument return SessionRunArgs(self._global_step_tensor) def after_run(self, run_context, run_values): diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py index 25962f6bf7a..31898562f81 100644 --- a/tensorflow/python/training/basic_session_run_hooks_test.py +++ b/tensorflow/python/training/basic_session_run_hooks_test.py @@ -466,8 +466,8 @@ class CheckpointSaverHookTest(test.TestCase): self.assertEqual(2, global_step_val) self.assertEqual({ 'begin': 1, - 'before_save': 2, - 'after_save': 2, + 'before_save': 3, + 'after_save': 3, 'end': 1 }, listener_counts) @@ -490,8 +490,8 @@ class CheckpointSaverHookTest(test.TestCase): self.assertEqual(2, global_step_val) self.assertEqual({ 'begin': 1, - 'before_save': 2, - 'after_save': 2, + 'before_save': 3, + 'after_save': 3, 'end': 1 }, listener_counts) @@ -523,8 +523,8 @@ class CheckpointSaverHookTest(test.TestCase): self.assertEqual(2, global_step_val) self.assertEqual({ 'begin': 1, - 'before_save': 2, - 'after_save': 2, + 'before_save': 3, + 'after_save': 3, 'end': 1 }, listener1_counts) self.assertEqual(listener1_counts, listener2_counts) @@ -706,6 +706,7 @@ class CheckpointSaverHookTest(test.TestCase): with session_lib.Session() as sess: sess.run(self.scaffold.init_op) mon_sess = monitored_session._HookedSession(sess, [hook]) + hook.after_create_session(sess, None) mon_sess.run(self.train_op) summary_writer.assert_summaries( test_case=self, @@ -718,6 +719,31 @@ class CheckpointSaverHookTest(test.TestCase): fake_summary_writer.FakeSummaryWriter.uninstall() + def test_save_checkpoint_before_first_train_step(self): + with self.graph.as_default(): + hook = basic_session_run_hooks.CheckpointSaverHook( + self.model_dir, save_steps=2, scaffold=self.scaffold) + hook.begin() + self.scaffold.finalize() + with session_lib.Session() as sess: + mon_sess = monitored_session._HookedSession(sess, [hook]) + sess.run(self.scaffold.init_op) + hook.after_create_session(sess, None) + # Verifies that checkpoint is saved at step 0. + self.assertEqual(0, + checkpoint_utils.load_variable(self.model_dir, + self.global_step.name)) + # Verifies that no checkpoint is saved after one training step. + mon_sess.run(self.train_op) + self.assertEqual(0, + checkpoint_utils.load_variable(self.model_dir, + self.global_step.name)) + # Verifies that checkpoint is saved after save_steps. + mon_sess.run(self.train_op) + self.assertEqual(2, + checkpoint_utils.load_variable(self.model_dir, + self.global_step.name)) + class CheckpointSaverHookMultiStepTest(test.TestCase): From f6ae3d54b0700ba76b56ebe3c702440f39460d2e Mon Sep 17 00:00:00 2001 From: Guangda Lai Date: Tue, 24 Apr 2018 10:51:08 -0700 Subject: [PATCH 0665/1734] Split gpu_id library to a header library and an implementation, so when if_static is false and we're building shared objects that depend on gpu_id, the implementation won't get linked. 
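To make the `CheckpointSaverHook` change in patch 0664 above concrete: the hook now writes a checkpoint for the initial global step as soon as the session is created, before any training step runs. A minimal sketch of how this surfaces to users of a TF 1.x monitored session; the `/tmp/ckpt` directory is a hypothetical location, not from the patch:

```python
import tensorflow as tf

global_step = tf.train.get_or_create_global_step()
train_op = tf.assign_add(global_step, 1)

# after_create_session fires when the monitored session starts, so a step-0
# checkpoint (model.ckpt-0, per the tests above) exists before the first
# call to sess.run(train_op).
hook = tf.train.CheckpointSaverHook(checkpoint_dir='/tmp/ckpt', save_steps=2)
with tf.train.MonitoredTrainingSession(hooks=[hook]) as sess:
    sess.run(train_op)  # saves again only once save_steps is reached
```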
PiperOrigin-RevId: 194111330 --- tensorflow/core/BUILD | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 843fd7b907d..bda87c6aed2 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -2563,6 +2563,19 @@ tf_cuda_library( cc_library( name = "gpu_id", + hdrs = [ + "common_runtime/gpu/gpu_id.h", + "common_runtime/gpu/gpu_id_manager.h", + ], + deps = [ + ":lib", + ] + if_static([ + ":gpu_id_impl", + ]), +) + +cc_library( + name = "gpu_id_impl", srcs = ["common_runtime/gpu/gpu_id_manager.cc"], hdrs = [ "common_runtime/gpu/gpu_id.h", @@ -2612,7 +2625,7 @@ tf_cuda_library( ":core_cpu_lib", ":framework", ":framework_internal", - ":gpu_id", + ":gpu_id_impl", ":gpu_init_impl", ":gpu_lib", ":graph", From 09398096284995d8a93c124bdbd70d6e1a44fbc3 Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Tue, 24 Apr 2018 10:59:10 -0700 Subject: [PATCH 0666/1734] Update README.md --- tensorflow/tools/docker/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/tools/docker/README.md b/tensorflow/tools/docker/README.md index f46c56e11aa..525f2995cee 100644 --- a/tensorflow/tools/docker/README.md +++ b/tensorflow/tools/docker/README.md @@ -16,12 +16,12 @@ quick links here: We currently maintain two Docker container images: -* `gcr.io/tensorflow/tensorflow` - TensorFlow with all dependencies - CPU only! +* `tensorflow/tensorflow` - TensorFlow with all dependencies - CPU only! -* `gcr.io/tensorflow/tensorflow:latest-gpu` - TensorFlow with all dependencies +* `tensorflow/tensorflow:latest-gpu` - TensorFlow with all dependencies and support for NVidia CUDA -Note: We also publish the same containers into +Note: We store all our containers on [Docker Hub](https://hub.docker.com/r/tensorflow/tensorflow/tags/). @@ -29,12 +29,12 @@ Note: We also publish the same containers into Run non-GPU container using - $ docker run -it -p 8888:8888 gcr.io/tensorflow/tensorflow + $ docker run -it -p 8888:8888 tensorflow/tensorflow For GPU support install NVidia drivers (ideally latest) and [nvidia-docker](https://github.com/NVIDIA/nvidia-docker). Run using - $ nvidia-docker run -it -p 8888:8888 gcr.io/tensorflow/tensorflow:latest-gpu + $ nvidia-docker run -it -p 8888:8888 tensorflow/tensorflow:latest-gpu Note: If you would have a problem running nvidia-docker you may try the old method @@ -44,7 +44,7 @@ it there and try using nvidia-docker as described above. $ # The old, not recommended way to run docker with gpu support: $ export CUDA_SO=$(\ls /usr/lib/x86_64-linux-gnu/libcuda.* | xargs -I{} echo '-v {}:{}') $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') - $ docker run -it -p 8888:8888 $CUDA_SO $DEVICES gcr.io/tensorflow/tensorflow:latest-gpu + $ docker run -it -p 8888:8888 $CUDA_SO $DEVICES tensorflow/tensorflow:latest-gpu ## More containers From b7b7ec32b848d6f5a7cf432fb44ceed4c9587078 Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Tue, 24 Apr 2018 10:57:00 -0700 Subject: [PATCH 0667/1734] Add note that setting LD_LIBRARY_PATH after having already kicked off a build requires a clean rebuild. 
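A sketch of the pitfall patch 0667 documents: the build picks up `LD_LIBRARY_PATH` when it is kicked off, and changing the variable afterwards requires a clean rebuild. A simple pre-build sanity check, assuming CUDA is installed under `/usr/local/cuda`:

```python
import os

# The build snapshots LD_LIBRARY_PATH when it starts; changing the variable
# after kicking off a build requires a clean rebuild, so check it up front.
lib_path = os.environ.get('LD_LIBRARY_PATH', '')
if '/usr/local/cuda' not in lib_path:  # assumed install location
    print('Warning: CUDA libraries may not be on LD_LIBRARY_PATH:', lib_path)
```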
PiperOrigin-RevId: 194112367 --- tensorflow/docs_src/install/install_sources.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md index b1867586530..71f066e4cb2 100644 --- a/tensorflow/docs_src/install/install_sources.md +++ b/tensorflow/docs_src/install/install_sources.md @@ -393,9 +393,9 @@ If you are new to TensorFlow, see @{$get_started/premade_estimators$Getting Star If the system outputs an error message instead of a greeting, see [Common installation problems](#common_installation_problems). -## Common installation problems +## Common build and installation problems -The installation problems you encounter typically depend on the +The build and installation problems you encounter typically depend on the operating system. See the "Common installation problems" section of one of the following guides: @@ -448,6 +448,11 @@ Stack Overflow and specify the `tensorflow` tag.
 <table>
-<tr> <th>Stack Overflow Link</th> <th>Error Message</th> </tr>
+<tr> <th>Link to GitHub or Stack Overflow</th> <th>Error Message</th> </tr>
 <tr>
   <td>36159194</td>
 </tr>
+<tr>
+  <td>47080760</td>
+  <td><pre>undefined reference to `cublasGemmEx@libcublas.so.9.0'</pre></td>
+</tr>
 </table>
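Since the row added above points at a CUDA link failure, a quick runtime check (not part of the patch) that an installed GPU build actually resolved its CUDA libraries:

```python
import tensorflow as tf

print(tf.__version__)
print(tf.test.is_built_with_cuda())  # False means a CPU-only build
with tf.Session() as sess:
    # GPU devices only show up if the CUDA libraries resolved at load time.
    for device in sess.list_devices():
        print(device.name)
```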
## Tested source configurations From 052c53c27956251e4b4952cd862596a9c08584e4 Mon Sep 17 00:00:00 2001 From: Billy Lamberta Date: Tue, 24 Apr 2018 11:09:09 -0700 Subject: [PATCH 0668/1734] Review fixes to install_linux --- tensorflow/docs_src/install/install_linux.md | 123 +++++++++++++------ 1 file changed, 86 insertions(+), 37 deletions(-) diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md index 9b431e49eeb..fa82ac9c40a 100644 --- a/tensorflow/docs_src/install/install_linux.md +++ b/tensorflow/docs_src/install/install_linux.md @@ -116,23 +116,47 @@ There are a few options to install TensorFlow on your machine: ### Use `pip` in a virtual environment -This is the *recommended* install method. The -[Virtualenv](https://virtualenv.pypa.io/en/stable/) tool creates virtual Python -environments that are isolated from other Python development on the same machine. -In this scenario, you install TensorFlow and its dependencies within a virtual -environment that is available when *activated*. Virtualenv provides a reliable -way to install and run TensorFlow while avoiding conflicts with the rest of the -system. +Key Point: Using a virtual environment is the recommended install method. -1\. On Ubuntu, install the `pip` and `virtualenv` packages: +The [Virtualenv](https://virtualenv.pypa.io/en/stable/) tool creates virtual +Python environments that are isolated from other Python development on the same +machine. In this scenario, you install TensorFlow and its dependencies within a +virtual environment that is available when *activated*. Virtualenv provides a +reliable way to install and run TensorFlow while avoiding conflicts with the rest +of the system. + +##### 1. Install Python, `pip`, and `virtualenv`. + +On Ubuntu, Python is automatically installed and `pip` is *usually* installed. +Confirm the `python` and `pip` versions: + +
+  python -V
+  pip -V  # or: pip3 -V
+
+ +To install these packages on Ubuntu:
   sudo apt-get install python-pip python-dev python-virtualenv   # for Python 2.7
   sudo apt-get install python3-pip python3-dev python-virtualenv # for Python 3.n
 
-2\. Create a directory for the virtual environment and choose a Python -interpreter: +We *recommend* using `pip` version 8.1 or higher. If using a release before +version 8.1, upgrade `pip`: + +
+  sudo pip install -U pip
+
+ +If not using Ubuntu and [setuptools](https://pypi.org/project/setuptools/) is +installed, use `easy_install` to install `pip`: + +
+  easy_install -U pip
+
+ +##### 2. Create a directory for the virtual environment and choose a Python interpreter.
   mkdir ~/tensorflow  # somewhere to work out of
@@ -142,7 +166,9 @@ interpreter:
   virtualenv --system-site-packages -p python3 venv # Use Python 3.n
 
-3\. Activate the Virtualenv environment using one of these shell commands: +##### 3. Activate the Virtualenv environment. + +Use one of these shell-specific commands to activate the virtual environment:
   source ~/tensorflow/venv/bin/activate      # bash, sh, ksh, or zsh
@@ -152,26 +178,32 @@ interpreter:
 
 When the Virtualenv is activated, the shell prompt displays as `(venv) $`.
 
-4\. Upgrade `pip` in your virtual environment:
+##### 4. Upgrade `pip` in the virtual environment.
 
-See the [pip installation guide](https://pip.pypa.io/en/stable/installing/) for
-instructions, or use `easy_install`:
+Within the active virtual environment, upgrade `pip`:
 
 
-(venv)$ easy_install -U pip
+(venv)$ pip install -U pip
 
-5\. Within an active Virtualenv environment, use one of the following `pip` -commands to install the TensorFlow package: +You can install other Python packages within the virtual environment without +affecting packages outside the `virtualenv`. + +##### 5. Install TensorFlow in the virtual environment. + +Choose one of the available TensorFlow packages for installation: + +* `tensorflow` —Current release for CPU +* `tensorflow-gpu` —Current release with GPU support +* `tf-nightly` —Nightly build for CPU +* `tf-nightly-gpu` —Nightly build with GPU support + +Within an active Virtualenv environment, use `pip` to install the package:
-(venv)$ pip install --upgrade tensorflow      # for Python 2.7
-(venv)$ pip3 install --upgrade tensorflow     # for Python 3.n
-(venv)$ pip install --upgrade tensorflow-gpu  # for Python 2.7 and GPU
+  pip install -U tensorflow
 
-Success! TensorFlow is now installed. - Use `pip list` to show the packages installed in the virtual environment. [Validate the install](#ValidateYourInstallation) and test the version: @@ -179,6 +211,8 @@ Use `pip list` to show the packages installed in the virtual environment. (venv)$ python -c "import tensorflow as tf; print(tf.__version__)"
+Success: TensorFlow is now installed. + Use the `deactivate` command to stop the Python virtual environment. #### Problems @@ -222,10 +256,9 @@ environment, a system `pip` install is straightforward. See the [REQUIRED_PACKAGES section of setup.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/pip_package/setup.py) -for a list of TensorFlow packages that `pip` installs or upgrade`. +for a list of packages that TensorFlow installs. - -#### Install Python and `pip` +##### 1. Install Python, `pip`, and `virtualenv`. On Ubuntu, Python is automatically installed and `pip` is *usually* installed. Confirm the `python` and `pip` versions: @@ -235,28 +268,42 @@ Confirm the `python` and `pip` versions: pip -V # or: pip3 -V -We *strongly recommend* `pip` or `pip3` version 8.1 or higher. If using a release -before version 8.1, upgrade `pip`: +To install these packages on Ubuntu:
   sudo apt-get install python-pip python-dev   # for Python 2.7
   sudo apt-get install python3-pip python3-dev # for Python 3.n
 
- -#### Install TensorFlow - -Install one of the available TensorFlow packages: +We *recommend* using `pip` version 8.1 or higher. If using a release before +version 8.1, upgrade `pip`:
-  # Select one:
-  sudo pip install tensorflow      # Python 2.7 CPU (no GPU support)
-  sudo pip3 install tensorflow     # Python 3.n CPU (no GPU support)
-  sudo pip install tensorflow-gpu  # Python 2.7 GPU support
-  sudo pip3 install tensorflow-gpu # Python 3.n GPU support
+  sudo pip install -U pip
 
-Success! TensorFlow is now installed. +If not using Ubuntu and [setuptools](https://pypi.org/project/setuptools/) is +installed, use `easy_install` to install `pip`: + +
+  easy_install -U pip
+
+ +##### 2. Install TensorFlow on the system. + +Choose one of the available TensorFlow packages for installation: + +* `tensorflow` —Current release for CPU +* `tensorflow-gpu` —Current release with GPU support +* `tf-nightly` —Nightly build for CPU +* `tf-nightly-gpu` —Nightly build with GPU support + +Use `pip` to install the package for Python 2 or 3: + +
+  sudo pip install -U tensorflow   # Python 2.7
+  sudo pip3 install -U tensorflow  # Python 3.n
+
Use `pip list` to show the packages installed on the system. [Validate the install](#ValidateYourInstallation) and test the version: @@ -265,6 +312,8 @@ Use `pip list` to show the packages installed on the system. python -c "import tensorflow as tf; print(tf.__version__)" +Success: TensorFlow is now installed. + #### Problems If the above steps failed, try installing the TensorFlow binary using the remote From aeaec69869f13fc37c3ed28881741dd344e6a150 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 24 Apr 2018 11:18:47 -0700 Subject: [PATCH 0669/1734] Update ops-related pbtxt files. PiperOrigin-RevId: 194116315 --- .../core/ops/compat/ops_history.v1.pbtxt | 276 ++++++++++++++++++ tensorflow/core/ops/ops.pbtxt | 28 ++ 2 files changed, 304 insertions(+) diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt index 05dee30ca07..701897f162f 100644 --- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt @@ -2121,6 +2121,71 @@ op { } } } +op { + name: "ApplyAdagrad" + input_arg { + name: "var" + type_attr: "T" + is_ref: true + } + input_arg { + name: "accum" + type_attr: "T" + is_ref: true + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + output_arg { + name: "out" + type_attr: "T" + is_ref: true + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT32 + type: DT_UINT8 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_INT64 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_BFLOAT16 + type: DT_UINT16 + type: DT_COMPLEX128 + type: DT_HALF + type: DT_UINT32 + type: DT_UINT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } + attr { + name: "update_slots" + type: "bool" + default_value { + b: true + } + } +} op { name: "ApplyAdagradDA" input_arg { @@ -43524,6 +43589,65 @@ op { } is_stateful: true } +op { + name: "ResourceApplyAdagrad" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "accum" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT32 + type: DT_UINT8 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_INT64 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_BFLOAT16 + type: DT_UINT16 + type: DT_COMPLEX128 + type: DT_HALF + type: DT_UINT32 + type: DT_UINT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } + attr { + name: "update_slots" + type: "bool" + default_value { + b: true + } + } + is_stateful: true +} op { name: "ResourceApplyAdagradDA" input_arg { @@ -47876,6 +48000,79 @@ op { } is_stateful: true } +op { + name: "ResourceSparseApplyAdagrad" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "accum" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT32 + type: DT_UINT8 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_INT64 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_BFLOAT16 + type: DT_UINT16 + 
type: DT_COMPLEX128 + type: DT_HALF + type: DT_UINT32 + type: DT_UINT64 + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } + attr { + name: "update_slots" + type: "bool" + default_value { + b: true + } + } + is_stateful: true +} op { name: "ResourceSparseApplyAdagradDA" input_arg { @@ -58622,6 +58819,85 @@ op { } } } +op { + name: "SparseApplyAdagrad" + input_arg { + name: "var" + type_attr: "T" + is_ref: true + } + input_arg { + name: "accum" + type_attr: "T" + is_ref: true + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + output_arg { + name: "out" + type_attr: "T" + is_ref: true + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT32 + type: DT_UINT8 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_INT64 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_BFLOAT16 + type: DT_UINT16 + type: DT_COMPLEX128 + type: DT_HALF + type: DT_UINT32 + type: DT_UINT64 + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } + attr { + name: "update_slots" + type: "bool" + default_value { + b: true + } + } +} op { name: "SparseApplyAdagradDA" input_arg { diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 2edd15c446b..eb43c6fdfb5 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -891,6 +891,13 @@ op { b: false } } + attr { + name: "update_slots" + type: "bool" + default_value { + b: true + } + } } op { name: "ApplyAdagradDA" @@ -21784,6 +21791,13 @@ op { b: false } } + attr { + name: "update_slots" + type: "bool" + default_value { + b: true + } + } is_stateful: true } op { @@ -23150,6 +23164,13 @@ op { b: false } } + attr { + name: "update_slots" + type: "bool" + default_value { + b: true + } + } is_stateful: true } op { @@ -27187,6 +27208,13 @@ op { b: false } } + attr { + name: "update_slots" + type: "bool" + default_value { + b: true + } + } } op { name: "SparseApplyAdagradDA" From 4a82acf286df1bc10581d91e13e0ab17458e83b4 Mon Sep 17 00:00:00 2001 From: Raghuraman Krishnamoorthi Date: Tue, 24 Apr 2018 11:20:04 -0700 Subject: [PATCH 0670/1734] Improve handling of scopes in folding unfused batch norms. This change allows folding to work for MobilenetV2 with unfused batch norms PiperOrigin-RevId: 194116535 --- .../quantize/python/fold_batch_norms.py | 22 +++++- .../quantize/python/fold_batch_norms_test.py | 79 +++++++++++++++++++ 2 files changed, 99 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py index aa0ef643088..6f41722748b 100644 --- a/tensorflow/contrib/quantize/python/fold_batch_norms.py +++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py @@ -501,8 +501,27 @@ def _GetBatchNormParams(graph, context, has_scaling): bn_decay_var_tensor = None split_context = context.split('/') - base_context = split_context[-1] + # Matching variable names is brittle and relies on scoping + # conventions. Fused batch norm folding is more robust. Support for unfused + # batch norms will be deprecated as we move forward. 
Fused batch norms allow + # for faster training and should be used whenever possible. + # context contains part of the names of the tensors we are interested in: + # For MobilenetV1, the context has repetitions: + # MobilenetV1/MobilenetV1/Conv2d_3_depthwise + # when the moving_mean tensor has the name: + # MobilenetV1/Conv2d_3_depthwise/BatchNorm/moving_mean/read + # To pick the correct variable name, it is necessary to ignore the repeating + # header. + # For MobilenetV2, this problem does not exist: + # The context is: MobilenetV2/expanded_conv_3/depthwise + # and the names of the tensors start with a single MobilenetV2 + # The moving mean for example, has the name: + # MobilenetV2/expanded_conv_3/depthwise/BatchNorm/moving_mean/read + # We ignore the first string (MobilenetV1 or MobilenetV2) + # in the context to match correctly in both cases + + base_context = '/'.join(split_context[1:]) oplist = graph.get_operations() op_suffix_mean = base_context + '/BatchNorm/moments/Squeeze' op_suffix_variance = base_context + '/BatchNorm/moments/Squeeze_1' @@ -520,7 +539,6 @@ def _GetBatchNormParams(graph, context, has_scaling): op_suffix_gamma = base_context + '/BatchNorm/gamma' op_suffix_moving_variance = base_context + '/BatchNorm/moving_variance/read' op_suffix_moving_mean = base_context + '/BatchNorm/moving_mean/read' - # Parse through list of ops to find relevant ops for op in oplist: if op.name.endswith(op_suffix_mean): diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py index af31467476b..64e8142e7c6 100644 --- a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py +++ b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py @@ -134,6 +134,85 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): def testFoldConv2d(self): self._RunTestOverParameters(self._TestFoldConv2d) + def testMultipleLayerConv2d(self, + relu=nn_ops.relu, + relu_op_name='Relu', + has_scaling=True, + fused_batch_norm=False, + freeze_batch_norm_delay=None): + """Tests folding cases for a network with multiple layers. + + Args: + relu: Callable that returns an Operation, a factory method for the Relu*. + relu_op_name: String, name of the Relu* operation. + has_scaling: Bool, when true the batch norm has scaling. + fused_batch_norm: Bool, when true the batch norm is fused. 
+ freeze_batch_norm_delay: None or the number of steps after which training + switches to using frozen mean and variance + """ + g = ops.Graph() + with g.as_default(): + batch_size, height, width = 5, 128, 128 + inputs = array_ops.zeros((batch_size, height, width, 3)) + out_depth = 3 + stride = 1 + activation_fn = relu + scope = 'network/expanded_conv_1/conv' + layer1 = conv2d( + inputs, + out_depth, [5, 5], + stride=stride, + padding='SAME', + weights_initializer=self._WeightInit(0.09), + activation_fn=activation_fn, + normalizer_fn=batch_norm, + normalizer_params=self._BatchNormParams( + scale=has_scaling, fused=fused_batch_norm), + scope=scope) + # Add another layer + scope = 'network/expanded_conv_2/conv' + + _ = conv2d( + layer1, + 2 * out_depth, [5, 5], + stride=stride, + padding='SAME', + weights_initializer=self._WeightInit(0.09), + activation_fn=activation_fn, + normalizer_fn=batch_norm, + normalizer_params=self._BatchNormParams( + scale=has_scaling, fused=fused_batch_norm), + scope=scope) + + fold_batch_norms.FoldBatchNorms( + g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay) + folded_mul = g.get_operation_by_name(scope + '/mul_fold') + self.assertEqual(folded_mul.type, 'Mul') + self._AssertInputOpsAre(folded_mul, [ + scope + '/correction_mult', + self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm) + ]) + self._AssertOutputGoesToOps(folded_mul, g, [scope + '/Conv2D_Fold']) + + folded_conv = g.get_operation_by_name(scope + '/Conv2D_Fold') + self.assertEqual(folded_conv.type, 'Conv2D') + # Remove :0 at end of name for tensor prior to comparison + self._AssertInputOpsAre(folded_conv, + [scope + '/mul_fold', layer1.name[:-2]]) + self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul']) + + folded_add = g.get_operation_by_name(scope + '/add_fold') + self.assertEqual(folded_add.type, 'Add') + self._AssertInputOpsAre(folded_add, [ + scope + '/correction_add', + self._BathNormBiasName(scope, fused_batch_norm) + ]) + output_op_names = [scope + '/' + relu_op_name] + self._AssertOutputGoesToOps(folded_add, g, output_op_names) + + for op in g.get_operations(): + self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name) + def _TestFoldConv2dUnknownShape(self, relu, relu_op_name, with_bypass, has_scaling, fused_batch_norm, freeze_batch_norm_delay): From 9d2972e6ceb4911458e867d75466e14a31fa1773 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Tue, 24 Apr 2018 11:22:49 -0700 Subject: [PATCH 0671/1734] show breakdown of execution cost with compute and memory cost for op summarization PiperOrigin-RevId: 194117030 --- .../core/grappler/costs/virtual_scheduler.cc | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc index 0e5c654acfa..7f682729507 100644 --- a/tensorflow/core/grappler/costs/virtual_scheduler.cc +++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc @@ -850,14 +850,16 @@ Costs VirtualScheduler::Summary() const { VLOG(1) << "Expected max per-op streaming buffers: " << graph_costs_.max_per_op_streaming; - VLOG(1) << "Per-op execution time:"; + VLOG(1) << "Per-op execution time / compute time / memory time:"; for (const auto& op_cost_pair : op_to_cost_) { const auto& op = op_cost_pair.first; const auto& cost = op_cost_pair.second.execution_time.count(); + const auto& compute_cost = op_cost_pair.second.compute_time.count(); + const auto& memory_cost = op_cost_pair.second.memory_time.count(); const bool is_op_cost_accurate = !op_cost_pair.second.inaccurate; if (cost) { // Skip printing out zero-cost ops. VLOG(1) << " + " << op << " : " << (is_op_cost_accurate ? "" : "~") - << cost; + << cost << " / " << compute_cost << " / " << memory_cost; } } @@ -898,7 +900,8 @@ Costs VirtualScheduler::Summary() const { << ", at the end: " << strings::HumanReadableNumBytes(state.memory_usage); - VLOG(1) << "Per-op execution time (and memory usage at peak memory usage):"; + VLOG(1) << "Per-op execution time compute time / memory time " + "(and memory usage at peak memory usage):"; // Profile non-persistent op memory usage. for (const auto& node_port : state.mem_usage_snapshot_at_peak) { @@ -912,6 +915,8 @@ Costs VirtualScheduler::Summary() const { for (const auto& op_cost_pair : state.op_to_cost) { const auto& op = op_cost_pair.first; const auto& cost = op_cost_pair.second.execution_time.count(); + const auto& compute_cost = op_cost_pair.second.compute_time.count(); + const auto& memory_cost = op_cost_pair.second.memory_time.count(); total_compute_time_ns += op_cost_pair.second.execution_time; const bool is_op_cost_accurate = !op_cost_pair.second.inaccurate; if (!is_op_cost_accurate) { @@ -930,8 +935,9 @@ Costs VirtualScheduler::Summary() const { if (cost || mem_usage_percent > 1.0) { // Print out only non-zero cost ops or ops with > 1% memory usage. VLOG(1) << " + " << op << " : " << (is_op_cost_accurate ? "" : "~") - << cost << " (" << strings::HumanReadableNumBytes(op_mem_usage) - << " [" << mem_usage_percent << "%] " + << cost << " / " << compute_cost << " / " << memory_cost << " (" + << strings::HumanReadableNumBytes(op_mem_usage) << " [" + << mem_usage_percent << "%] " << (persisent_ops.count(op) > 0 ? ": persistent op)" : ")"); } } From d9cca05cbc5a4a7aeade2634e59fbf779965e3a0 Mon Sep 17 00:00:00 2001 From: Shashi Shekhar Date: Tue, 24 Apr 2018 11:24:37 -0700 Subject: [PATCH 0672/1734] Fix typo in event field name. 
PiperOrigin-RevId: 194117352 --- tensorflow/contrib/lite/profiling/profile_buffer.h | 10 +++++----- .../contrib/lite/profiling/profile_buffer_test.cc | 4 ++-- tensorflow/contrib/lite/profiling/profiler_test.cc | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tensorflow/contrib/lite/profiling/profile_buffer.h b/tensorflow/contrib/lite/profiling/profile_buffer.h index 3bfe02571ba..b2f565376c3 100644 --- a/tensorflow/contrib/lite/profiling/profile_buffer.h +++ b/tensorflow/contrib/lite/profiling/profile_buffer.h @@ -37,9 +37,9 @@ struct ProfileEvent { // Label of the event. This usually describes the event. const char* tag; // Timestamp in microseconds when the event began. - int64_t begin_timestamp_ms; + int64_t begin_timestamp_us; // Timestamp in microseconds when the event ended. - int64_t end_timestamp_ms; + int64_t end_timestamp_us; // The field containing the type of event. This must be one of the event types // in EventType. EventType event_type; @@ -79,8 +79,8 @@ class ProfileBuffer { event_buffer_[index].tag = tag; event_buffer_[index].event_type = event_type; event_buffer_[index].event_metadata = event_metadata; - event_buffer_[index].begin_timestamp_ms = timestamp; - event_buffer_[index].end_timestamp_ms = 0; + event_buffer_[index].begin_timestamp_us = timestamp; + event_buffer_[index].end_timestamp_us = 0; current_index_++; return index; } @@ -103,7 +103,7 @@ class ProfileBuffer { } int event_index = event_handle % max_size; - event_buffer_[event_index].end_timestamp_ms = NowMicros(); + event_buffer_[event_index].end_timestamp_us = NowMicros(); } // Returns the size of the buffer. diff --git a/tensorflow/contrib/lite/profiling/profile_buffer_test.cc b/tensorflow/contrib/lite/profiling/profile_buffer_test.cc index 0c5f0cd3149..b8784cca455 100644 --- a/tensorflow/contrib/lite/profiling/profile_buffer_test.cc +++ b/tensorflow/contrib/lite/profiling/profile_buffer_test.cc @@ -49,13 +49,13 @@ TEST(ProfileBufferTest, AddEvent) { auto event = GetProfileEvents(buffer)[0]; EXPECT_EQ(event->tag, "hello"); - EXPECT_GT(event->begin_timestamp_ms, 0); + EXPECT_GT(event->begin_timestamp_us, 0); EXPECT_EQ(event->event_type, ProfileEvent::EventType::DEFAULT); EXPECT_EQ(event->event_metadata, 42); buffer.EndEvent(event_handle); EXPECT_EQ(1, buffer.Size()); - EXPECT_GE(event->end_timestamp_ms, event->begin_timestamp_ms); + EXPECT_GE(event->end_timestamp_us, event->begin_timestamp_us); } TEST(ProfileBufferTest, OverFlow) { diff --git a/tensorflow/contrib/lite/profiling/profiler_test.cc b/tensorflow/contrib/lite/profiling/profiler_test.cc index 994523a8fb7..7914f36a319 100644 --- a/tensorflow/contrib/lite/profiling/profiler_test.cc +++ b/tensorflow/contrib/lite/profiling/profiler_test.cc @@ -30,7 +30,7 @@ namespace { void AssertDurationOfEventAroundMs(const ProfileEvent* event, double expected_ms, double eps_ms) { double duration_ms = - (event->end_timestamp_ms - event->begin_timestamp_ms) / 1e3; + (event->end_timestamp_us - event->begin_timestamp_us) / 1e3; EXPECT_NEAR(expected_ms, duration_ms, eps_ms); } From ff013946362e7d80c53b82b64a7f5b462808ff8f Mon Sep 17 00:00:00 2001 From: Malcolm Reynolds Date: Tue, 24 Apr 2018 11:26:26 -0700 Subject: [PATCH 0673/1734] Clarify error message when importing a GraphDef with unknown ops. This should make the situation from github.com/tensorflow/tensorflow/issues/17014 less confusing. 
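The failure mode patch 0673 addresses: importing a `GraphDef` that uses contrib ops fails with `No op named ...` unless the module registering those ops has been loaded first. A sketch, assuming a hypothetical frozen graph at `/tmp/model.pb`:

```python
import tensorflow as tf

# Referencing tf.contrib lazily loads it, which registers the contrib ops
# (e.g. Resampler) before the GraphDef below is parsed and imported.
tf.contrib.resampler  # noqa: imported for the op-registration side effect

graph_def = tf.GraphDef()
with open('/tmp/model.pb', 'rb') as f:  # hypothetical frozen graph path
    graph_def.ParseFromString(f.read())
tf.import_graph_def(graph_def, name='')
```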
PiperOrigin-RevId: 194117660 --- tensorflow/python/framework/importer.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py index 3f8a8c4befb..5112bea48b5 100644 --- a/tensorflow/python/framework/importer.py +++ b/tensorflow/python/framework/importer.py @@ -572,7 +572,14 @@ def import_graph_def(graph_def, if node.name in name_to_op: raise ValueError('Duplicate name \'%s\' in GraphDef.' % node.name) if node.op not in op_dict: - raise ValueError('No op named %s in defined operations.' % node.op) + raise ValueError( + 'No op named %s in defined operations. If the Graph you are ' + 'importing uses custom ops or any parts of tf.contrib, you ' + 'should explicitly import the libraries defining those ops ' + 'before loading the Graph. Note that tf.contrib is lazily loaded ' + 'when accessed, so simply referencing (e.g.) ' + '`tf.contrib.resampler` will cause those ops to be made ' + 'available.' % node.op) op_def = op_dict[node.op] output_types = _OutputTypes(node, op_dict) From de3e9830aae0904f0d40d37e9da5b113c4a9a0f0 Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Tue, 24 Apr 2018 11:29:43 -0700 Subject: [PATCH 0674/1734] Small refactor of tf.keras aiming at centralizing reusable utilities in `utils`. PiperOrigin-RevId: 194118244 --- .../_impl/keras/applications/mobilenet.py | 1 - .../keras/_impl/keras/engine/base_layer.py | 133 ++---------------- .../keras/_impl/keras/engine/network.py | 39 ++--- .../keras/_impl/keras/engine/topology_test.py | 8 +- .../keras/layers/advanced_activations.py | 14 +- .../keras/_impl/keras/layers/convolutional.py | 4 +- .../keras/layers/convolutional_recurrent.py | 6 +- .../keras/_impl/keras/layers/embeddings.py | 6 +- .../python/keras/_impl/keras/layers/local.py | 10 +- .../python/keras/_impl/keras/layers/merge.py | 16 +-- .../python/keras/_impl/keras/layers/noise.py | 8 +- .../keras/_impl/keras/layers/recurrent.py | 26 ++-- .../keras/_impl/keras/layers/wrappers.py | 18 +-- .../keras/_impl/keras/utils/generic_utils.py | 30 ++++ .../keras/_impl/keras/utils/tf_utils.py | 80 +++++++++++ 15 files changed, 199 insertions(+), 200 deletions(-) diff --git a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py index 12775fccecd..7b7288793de 100644 --- a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py +++ b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py @@ -79,7 +79,6 @@ from tensorflow.python.keras._impl.keras.applications import imagenet_utils from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions from tensorflow.python.keras._impl.keras.engine import InputSpec -from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs from tensorflow.python.keras._impl.keras.layers import Activation from tensorflow.python.keras._impl.keras.layers import BatchNormalization diff --git a/tensorflow/python/keras/_impl/keras/engine/base_layer.py b/tensorflow/python/keras/_impl/keras/engine/base_layer.py index abae6c3785b..a3e78c95dc9 100644 --- a/tensorflow/python/keras/_impl/keras/engine/base_layer.py +++ b/tensorflow/python/keras/_impl/keras/engine/base_layer.py @@ -20,7 +20,6 @@ from __future__ import print_function import collections 
import inspect # Necessary supplement to tf_inspect to deal with variadic args. -import re import numpy as np from six.moves import zip # pylint: disable=redefined-builtin @@ -35,6 +34,10 @@ from tensorflow.python.keras._impl.keras import constraints from tensorflow.python.keras._impl.keras import initializers from tensorflow.python.keras._impl.keras import regularizers from tensorflow.python.keras._impl.keras.utils import generic_utils +from tensorflow.python.keras._impl.keras.utils import tf_utils +# A module that only depends on `keras.layers` import these from here. +from tensorflow.python.keras._impl.keras.utils.generic_utils import to_snake_case # pylint: disable=unused-import +from tensorflow.python.keras._impl.keras.utils.tf_utils import is_tensor_or_tensor_list # pylint: disable=unused-import from tensorflow.python.ops import array_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import variable_scope as vs @@ -177,7 +180,8 @@ class Layer(checkpointable.CheckpointableBase): def _init_set_name(self, name, zero_based=True): if not name: self._name = unique_layer_name( - to_snake_case(self.__class__.__name__), zero_based=zero_based) + generic_utils.to_snake_case(self.__class__.__name__), + zero_based=zero_based) else: self._name = name @@ -318,7 +322,7 @@ class Layer(checkpointable.CheckpointableBase): # Requesting input-conditional updates. inputs = nest.flatten(inputs) - reachable = get_reachable_from_inputs(inputs, self.updates) + reachable = tf_utils.get_reachable_from_inputs(inputs, self.updates) updates = [] for update in self.updates: if update in reachable: @@ -419,7 +423,7 @@ class Layer(checkpointable.CheckpointableBase): # The losses we want to return will be part of this set. # To avoid unnecessary work, we stop the search in case all of # `self.losses` have been retrieved. - reachable = get_reachable_from_inputs(inputs, self.losses) + reachable = tf_utils.get_reachable_from_inputs(inputs, self.losses) losses = [] for loss in self.losses: if loss in reachable: @@ -639,7 +643,7 @@ class Layer(checkpointable.CheckpointableBase): if not hasattr(self, '_call_fn_args'): self._call_fn_args = estimator_util.fn_args(self.call) if ('mask' in self._call_fn_args and 'mask' not in kwargs and - not is_all_none(previous_mask)): + not generic_utils.is_all_none(previous_mask)): # The previous layer generated a mask, and mask was not explicitly pass # to __call__, hence we set previous_mask as the default value. kwargs['mask'] = previous_mask @@ -1615,9 +1619,9 @@ class Node(object): # Following 2 properties: input and output shapes. # List of shape tuples, shapes of input_tensors. - self.input_shapes = [static_shape(x) for x in input_tensors] + self.input_shapes = [backend.int_shape(x) for x in input_tensors] # List of shape tuples, shapes of output_tensors. - self.output_shapes = [static_shape(x) for x in output_tensors] + self.output_shapes = [backend.int_shape(x) for x in output_tensors] # Optional keyword arguments to layer's `call`. self.arguments = arguments @@ -1678,91 +1682,6 @@ class DeferredTensor(object): self.dtype.name) -def shape_type_conversion(fn): - """Decorator that handles tuple/TensorShape conversion. - - Used in `compute_output_shape` and `build`. - - Arguments: - fn: function to wrap. - - Returns: - Wrapped function. 
- """ - - def wrapper(instance, input_shape): - if input_shape is not None: - if isinstance(input_shape, list): - input_shape = [ - tuple(tensor_shape.TensorShape(x).as_list()) for x in input_shape] - else: - input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list()) - output_shape = fn(instance, input_shape) - if output_shape is not None: - if isinstance(output_shape, list): - return [tensor_shape.TensorShape(x) for x in output_shape] - return tensor_shape.TensorShape(output_shape) - - return wrapper - - -def object_list_uid(object_list): - """Creates a single string from object ids.""" - object_list = nest.flatten(object_list) - return ', '.join([str(abs(id(x))) for x in object_list]) - - -def static_shape(x): - """Get the static shape of a Tensor, or None if it is unavailable.""" - if x is None: - return None - try: - return tuple(x.get_shape().as_list()) - except ValueError: - return None - - -def get_reachable_from_inputs(inputs, targets=None): - """Returns the set of tensors/ops reachable from `inputs`. - - Stops if all targets have been found (target is optional). - - Only valid in Symbolic mode, not Eager mode. - - Args: - inputs: List of tensors. - targets: List of tensors. - - Returns: - A set of tensors reachable from the inputs (includes the inputs themselves). - """ - reachable = set(inputs) - if targets: - targets = set(targets) - queue = inputs[:] - - while queue: - x = queue.pop() - if isinstance(x, ops.Operation): - outputs = x.outputs[:] or [] - outputs += x._control_outputs - elif isinstance(x, ops.Tensor): - outputs = x.consumers() - elif isinstance(x, tf_variables.Variable): - outputs = [x.op] - else: - raise TypeError('Expected Operation, Variable, or Tensor, got ' + str(x)) - - for y in outputs: - if y not in reachable: - reachable.add(y) - queue.insert(0, y) - - if targets and targets.issubset(reachable): - return reachable - return reachable - - def unique_layer_name(name, name_uid_map=None, avoid_names=None, namespace='', zero_based=False): """Makes a layer name (or arbitrary string) unique within a TensorFlow graph. @@ -1809,28 +1728,6 @@ def unique_layer_name(name, name_uid_map=None, avoid_names=None, namespace='', return proposed_name -def to_snake_case(name): - intermediate = re.sub('(.)([A-Z][a-z0-9]+)', r'\1_\2', name) - insecure = re.sub('([a-z])([A-Z])', r'\1_\2', intermediate).lower() - # If the class is private the name starts with "_" which is not secure - # for creating scopes. We prefix the name with "private" in this case. - if insecure[0] != '_': - return insecure - return 'private' + insecure - - -def is_all_none(iterable_or_element): - if not isinstance(iterable_or_element, (list, tuple)): - iterable = [iterable_or_element] - else: - iterable = iterable_or_element - # We cannot use Python's `any` because the iterable may return Tensors. - for element in iterable: - if element is not None: - return False - return True - - def have_all_keras_metadata(iterable_or_element): if not isinstance(iterable_or_element, (list, tuple)): iterable = [iterable_or_element] @@ -1861,14 +1758,6 @@ def collect_previous_mask(input_tensors): return masks -def is_tensor_or_tensor_list(v): - v = nest.flatten(v) - if v and isinstance(v[0], ops.Tensor): - return True - else: - return False - - def get_default_graph_uid_map(): # TODO(fchollet): refactor this into backend. 
graph = ops.get_default_graph() diff --git a/tensorflow/python/keras/_impl/keras/engine/network.py b/tensorflow/python/keras/_impl/keras/engine/network.py index 4127c781eb4..9f8ee129aac 100644 --- a/tensorflow/python/keras/_impl/keras/engine/network.py +++ b/tensorflow/python/keras/_impl/keras/engine/network.py @@ -32,10 +32,11 @@ from tensorflow.python.eager import context from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape -from tensorflow.python.keras._impl.keras import backend as K +from tensorflow.python.keras._impl.keras import backend from tensorflow.python.keras._impl.keras.engine import base_layer from tensorflow.python.keras._impl.keras.engine import saving from tensorflow.python.keras._impl.keras.utils import generic_utils +from tensorflow.python.keras._impl.keras.utils import tf_utils from tensorflow.python.keras._impl.keras.utils.io_utils import ask_to_proceed_with_overwrite from tensorflow.python.keras._impl.keras.utils.layer_utils import print_summary as print_layer_summary from tensorflow.python.platform import tf_logging as logging @@ -252,8 +253,8 @@ class Network(base_layer.Layer): for x in self.inputs: mask = x._keras_mask if hasattr(x, '_keras_mask') else None # pylint: disable=protected-access masks.append(mask) - mask_cache_key = (base_layer.object_list_uid(self.inputs) + '_' + - base_layer.object_list_uid(masks)) + mask_cache_key = (generic_utils.object_list_uid(self.inputs) + '_' + + generic_utils.object_list_uid(masks)) masks = [] for x in self.outputs: mask = x._keras_mask if hasattr(x, '_keras_mask') else None # pylint: disable=protected-access @@ -274,7 +275,7 @@ class Network(base_layer.Layer): self.input_names.append(layer.name) if layer.is_placeholder: self._feed_input_names.append(layer.name) - self._feed_input_shapes.append(K.int_shape(self.inputs[i])) + self._feed_input_shapes.append(backend.int_shape(self.inputs[i])) # layer.input gives an error in eager mode if not context.executing_eagerly(): self._feed_inputs.append(layer.input) @@ -373,7 +374,7 @@ class Network(base_layer.Layer): weights = [] for layer in self.layers: weights += layer.weights - return K.batch_get_value(weights) + return backend.batch_get_value(weights) def set_weights(self, weights): """Sets the weights of the model. 
@@ -389,7 +390,7 @@ class Network(base_layer.Layer): for sw, w in zip(layer.weights, layer_weights): tuples.append((sw, w)) weights = weights[num_param:] - K.batch_set_value(tuples) + backend.batch_set_value(tuples) def compute_mask(self, inputs, mask): if not self._is_graph_network: @@ -400,8 +401,8 @@ class Network(base_layer.Layer): masks = [None for _ in range(len(inputs))] else: masks = generic_utils.to_list(mask) - cache_key = (base_layer.object_list_uid(inputs) - + '_' + base_layer.object_list_uid(masks)) + cache_key = (generic_utils.object_list_uid(inputs) + + '_' + generic_utils.object_list_uid(masks)) if cache_key in self._output_mask_cache: return self._output_mask_cache[cache_key] else: @@ -515,7 +516,7 @@ class Network(base_layer.Layer): relevant_inputs += inputs else: relevant_inputs.append(inputs) - reachable = base_layer.get_reachable_from_inputs(relevant_inputs, updates) + reachable = tf_utils.get_reachable_from_inputs(relevant_inputs, updates) relevant_conditional_updates = [x for x in updates if x in reachable] unconditional_updates = [ x for x in updates if x._unconditional_update] # pylint: disable=protected-access @@ -552,7 +553,7 @@ class Network(base_layer.Layer): relevant_inputs += inputs else: relevant_inputs.append(inputs) - reachable = base_layer.get_reachable_from_inputs(relevant_inputs, losses) + reachable = tf_utils.get_reachable_from_inputs(relevant_inputs, losses) relevant_conditional_losses = [x for x in losses if x in reachable] unconditional_losses = [ x for x in losses if x._unconditional_loss] # pylint: disable=protected-access @@ -634,8 +635,8 @@ class Network(base_layer.Layer): if not context.executing_eagerly(): # Try to retrieve cached outputs if the layer has already been called # on these exact inputs. - cache_key = (base_layer.object_list_uid(inputs) - + '_' + base_layer.object_list_uid(masks)) + cache_key = (generic_utils.object_list_uid(inputs) + + '_' + generic_utils.object_list_uid(masks)) if cache_key in self._output_tensor_cache: # Cache hit. return self._output_tensor_cache[cache_key] @@ -667,7 +668,7 @@ class Network(base_layer.Layer): ': model has ' + str(len(self._input_layers)) + ' tensor inputs.') - cache_key = base_layer.object_list_uid(input_shapes) + cache_key = generic_utils.object_list_uid(input_shapes) if cache_key not in self._output_shape_cache: # Cache miss. We have to run the network graph manually (recursive calls # to `compute_output_shape`). @@ -856,7 +857,7 @@ class Network(base_layer.Layer): for x in self.outputs: assert str(id(x)) in tensor_map, 'Could not compute output ' + str(x) tensor, mask = tensor_map[str(id(x))] - output_shapes.append(base_layer.static_shape(x)) + output_shapes.append(backend.int_shape(x)) output_tensors.append(tensor) output_masks.append(mask) @@ -870,14 +871,14 @@ class Network(base_layer.Layer): if not context.executing_eagerly(): # Update cache; # keys are based on ids on input tensors and inputs masks. 
- cache_key = (base_layer.object_list_uid(inputs) - + '_' + base_layer.object_list_uid(masks)) + cache_key = (generic_utils.object_list_uid(inputs) + + '_' + generic_utils.object_list_uid(masks)) self._output_tensor_cache[cache_key] = output_tensors self._output_mask_cache[cache_key] = output_masks if output_shapes is not None: - input_shapes = [base_layer.static_shape(x) for x in inputs] - cache_key = base_layer.object_list_uid(input_shapes) + input_shapes = [backend.int_shape(x) for x in inputs] + cache_key = generic_utils.object_list_uid(input_shapes) self._output_shape_cache[cache_key] = output_shapes return output_tensors, output_masks @@ -1338,7 +1339,7 @@ class Network(base_layer.Layer): 'class_name': self.__class__.__name__, 'config': config, 'keras_version': keras_version, - 'backend': K.backend() + 'backend': backend.backend() } return model_config diff --git a/tensorflow/python/keras/_impl/keras/engine/topology_test.py b/tensorflow/python/keras/_impl/keras/engine/topology_test.py index 49cc1cd3b38..6993a042890 100644 --- a/tensorflow/python/keras/_impl/keras/engine/topology_test.py +++ b/tensorflow/python/keras/_impl/keras/engine/topology_test.py @@ -964,16 +964,16 @@ class GraphUtilsTest(test.TestCase): x_5 = x_3 * pl_1 self.assertEqual( - keras.engine.base_layer.get_reachable_from_inputs([pl_1]), + keras.utils.tf_utils.get_reachable_from_inputs([pl_1]), {pl_1, x_1, x_4, x_5, x_1.op, x_4.op, x_5.op}) self.assertEqual( - keras.engine.base_layer.get_reachable_from_inputs([pl_1, pl_2]), + keras.utils.tf_utils.get_reachable_from_inputs([pl_1, pl_2]), {pl_1, pl_2, x_1, x_2, x_4, x_5, x_1.op, x_2.op, x_4.op, x_5.op}) self.assertEqual( - keras.engine.base_layer.get_reachable_from_inputs([pl_3]), + keras.utils.tf_utils.get_reachable_from_inputs([pl_3]), {pl_3, x_3, x_5, x_3.op, x_5.op}) self.assertEqual( - keras.engine.base_layer.get_reachable_from_inputs([x_3]), + keras.utils.tf_utils.get_reachable_from_inputs([x_3]), {x_3, x_5, x_5.op}) diff --git a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py b/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py index 11ca89d625b..89931db3c07 100644 --- a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py +++ b/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py @@ -25,7 +25,7 @@ from tensorflow.python.keras._impl.keras import initializers from tensorflow.python.keras._impl.keras import regularizers from tensorflow.python.keras._impl.keras.engine import InputSpec from tensorflow.python.keras._impl.keras.engine import Layer -from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion +from tensorflow.python.keras._impl.keras.utils import tf_utils from tensorflow.python.ops import math_ops from tensorflow.python.util.tf_export import tf_export @@ -64,7 +64,7 @@ class LeakyReLU(Layer): base_config = super(LeakyReLU, self).get_config() return dict(list(base_config.items()) + list(config.items())) - @shape_type_conversion + @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): return input_shape @@ -119,7 +119,7 @@ class PReLU(Layer): else: self.shared_axes = list(shared_axes) - @shape_type_conversion + @tf_utils.shape_type_conversion def build(self, input_shape): param_shape = list(input_shape[1:]) self.param_broadcast = [False] * len(param_shape) @@ -162,7 +162,7 @@ class PReLU(Layer): base_config = super(PReLU, self).get_config() return dict(list(base_config.items()) + list(config.items())) - @shape_type_conversion + 
@tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): return input_shape @@ -201,7 +201,7 @@ class ELU(Layer): base_config = super(ELU, self).get_config() return dict(list(base_config.items()) + list(config.items())) - @shape_type_conversion + @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): return input_shape @@ -241,7 +241,7 @@ class ThresholdedReLU(Layer): base_config = super(ThresholdedReLU, self).get_config() return dict(list(base_config.items()) + list(config.items())) - @shape_type_conversion + @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): return input_shape @@ -275,6 +275,6 @@ class Softmax(Layer): base_config = super(Softmax, self).get_config() return dict(list(base_config.items()) + list(config.items())) - @shape_type_conversion + @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): return input_shape diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional.py b/tensorflow/python/keras/_impl/keras/layers/convolutional.py index 12b965587f5..9971f127732 100644 --- a/tensorflow/python/keras/_impl/keras/layers/convolutional.py +++ b/tensorflow/python/keras/_impl/keras/layers/convolutional.py @@ -28,7 +28,6 @@ from tensorflow.python.keras._impl.keras import initializers from tensorflow.python.keras._impl.keras import regularizers from tensorflow.python.keras._impl.keras.engine import InputSpec from tensorflow.python.keras._impl.keras.engine import Layer -from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion # imports for backwards namespace compatibility # pylint: disable=unused-import from tensorflow.python.keras._impl.keras.layers.pooling import AveragePooling1D @@ -39,6 +38,7 @@ from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling2D from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling3D # pylint: enable=unused-import from tensorflow.python.keras._impl.keras.utils import conv_utils +from tensorflow.python.keras._impl.keras.utils import tf_utils from tensorflow.python.ops import array_ops from tensorflow.python.ops import nn from tensorflow.python.ops import nn_ops @@ -1731,7 +1731,7 @@ class DepthwiseConv2D(Conv2D): return outputs - @shape_type_conversion + @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): if self.data_format == 'channels_first': rows = input_shape[2] diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py index 6b2a1d98fe7..be25bbc043a 100644 --- a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py +++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py @@ -28,11 +28,11 @@ from tensorflow.python.keras._impl.keras import initializers from tensorflow.python.keras._impl.keras import regularizers from tensorflow.python.keras._impl.keras.engine import InputSpec from tensorflow.python.keras._impl.keras.engine import Layer -from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion from tensorflow.python.keras._impl.keras.layers.recurrent import _generate_dropout_mask from tensorflow.python.keras._impl.keras.layers.recurrent import RNN from tensorflow.python.keras._impl.keras.utils import conv_utils from tensorflow.python.keras._impl.keras.utils import generic_utils +from tensorflow.python.keras._impl.keras.utils import tf_utils from tensorflow.python.util.tf_export import tf_export @@ 
-168,7 +168,7 @@ class ConvRNN2D(RNN): self.input_spec = [InputSpec(ndim=5)] self.states = None - @shape_type_conversion + @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): if isinstance(input_shape, list): input_shape = input_shape[0] @@ -209,7 +209,7 @@ class ConvRNN2D(RNN): for _ in range(2)] return output_shape - @shape_type_conversion + @tf_utils.shape_type_conversion def build(self, input_shape): # Note input_shape will be list of shapes of initial states and # constants if these are passed in __call__. diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings.py b/tensorflow/python/keras/_impl/keras/layers/embeddings.py index 07b8726b859..2b353ac007a 100644 --- a/tensorflow/python/keras/_impl/keras/layers/embeddings.py +++ b/tensorflow/python/keras/_impl/keras/layers/embeddings.py @@ -23,7 +23,7 @@ from tensorflow.python.keras._impl.keras import constraints from tensorflow.python.keras._impl.keras import initializers from tensorflow.python.keras._impl.keras import regularizers from tensorflow.python.keras._impl.keras.engine import Layer -from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion +from tensorflow.python.keras._impl.keras.utils import tf_utils from tensorflow.python.ops import embedding_ops from tensorflow.python.ops import math_ops from tensorflow.python.util.tf_export import tf_export @@ -114,7 +114,7 @@ class Embedding(Layer): self.mask_zero = mask_zero self.input_length = input_length - @shape_type_conversion + @tf_utils.shape_type_conversion def build(self, input_shape): self.embeddings = self.add_weight( shape=(self.input_dim, self.output_dim), @@ -130,7 +130,7 @@ class Embedding(Layer): else: return math_ops.not_equal(inputs, 0) - @shape_type_conversion + @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): if self.input_length is None: return input_shape + (self.output_dim,) diff --git a/tensorflow/python/keras/_impl/keras/layers/local.py b/tensorflow/python/keras/_impl/keras/layers/local.py index 13d96e93922..caae820fb3a 100644 --- a/tensorflow/python/keras/_impl/keras/layers/local.py +++ b/tensorflow/python/keras/_impl/keras/layers/local.py @@ -25,8 +25,8 @@ from tensorflow.python.keras._impl.keras import initializers from tensorflow.python.keras._impl.keras import regularizers from tensorflow.python.keras._impl.keras.engine import InputSpec from tensorflow.python.keras._impl.keras.engine import Layer -from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion from tensorflow.python.keras._impl.keras.utils import conv_utils +from tensorflow.python.keras._impl.keras.utils import tf_utils from tensorflow.python.util.tf_export import tf_export @@ -120,7 +120,7 @@ class LocallyConnected1D(Layer): self.bias_constraint = constraints.get(bias_constraint) self.input_spec = InputSpec(ndim=3) - @shape_type_conversion + @tf_utils.shape_type_conversion def build(self, input_shape): input_dim = input_shape[2] if input_dim is None: @@ -148,7 +148,7 @@ class LocallyConnected1D(Layer): self.input_spec = InputSpec(ndim=3, axes={2: input_dim}) self.built = True - @shape_type_conversion + @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): length = conv_utils.conv_output_length(input_shape[1], self.kernel_size[0], self.padding, self.strides[0]) @@ -307,7 +307,7 @@ class LocallyConnected2D(Layer): self.bias_constraint = constraints.get(bias_constraint) self.input_spec = InputSpec(ndim=4) - @shape_type_conversion + 
@tf_utils.shape_type_conversion def build(self, input_shape): if self.data_format == 'channels_last': input_row, input_col = input_shape[1:-1] @@ -350,7 +350,7 @@ class LocallyConnected2D(Layer): self.input_spec = InputSpec(ndim=4, axes={-1: input_filter}) self.built = True - @shape_type_conversion + @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): if self.data_format == 'channels_first': rows = input_shape[2] diff --git a/tensorflow/python/keras/_impl/keras/layers/merge.py b/tensorflow/python/keras/_impl/keras/layers/merge.py index 7c87e6c0671..2b6cf7c8a94 100644 --- a/tensorflow/python/keras/_impl/keras/layers/merge.py +++ b/tensorflow/python/keras/_impl/keras/layers/merge.py @@ -22,7 +22,7 @@ from __future__ import print_function from tensorflow.python.keras._impl.keras import backend as K from tensorflow.python.keras._impl.keras.engine.base_layer import Layer -from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion +from tensorflow.python.keras._impl.keras.utils import tf_utils from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn @@ -83,7 +83,7 @@ class _Merge(Layer): output_shape.append(i) return tuple(output_shape) - @shape_type_conversion + @tf_utils.shape_type_conversion def build(self, input_shape): # Used purely for shape validation. if not isinstance(input_shape, list): @@ -181,7 +181,7 @@ class _Merge(Layer): else: return self._merge_function(inputs) - @shape_type_conversion + @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): if input_shape[0] is None: output_shape = None @@ -274,7 +274,7 @@ class Subtract(_Merge): ``` """ - @shape_type_conversion + @tf_utils.shape_type_conversion def build(self, input_shape): super(Subtract, self).build(input_shape) if len(input_shape) != 2: @@ -370,7 +370,7 @@ class Concatenate(_Merge): self.supports_masking = True self._reshape_required = False - @shape_type_conversion + @tf_utils.shape_type_conversion def build(self, input_shape): # Used purely for shape validation. if not isinstance(input_shape, list) or len(input_shape) < 2: @@ -392,7 +392,7 @@ class Concatenate(_Merge): def _merge_function(self, inputs): return K.concatenate(inputs, axis=self.axis) - @shape_type_conversion + @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): if not isinstance(input_shape, list): raise ValueError('A `Concatenate` layer should be called ' @@ -478,7 +478,7 @@ class Dot(_Merge): self.supports_masking = True self._reshape_required = False - @shape_type_conversion + @tf_utils.shape_type_conversion def build(self, input_shape): # Used purely for shape validation. 
if not isinstance(input_shape, list) or len(input_shape) != 2: @@ -523,7 +523,7 @@ class Dot(_Merge): output = K.batch_dot(x1, x2, axes) return output - @shape_type_conversion + @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): if not isinstance(input_shape, list) or len(input_shape) != 2: raise ValueError('A `Dot` layer should be called ' diff --git a/tensorflow/python/keras/_impl/keras/layers/noise.py b/tensorflow/python/keras/_impl/keras/layers/noise.py index 72dc7a1ff8b..addac5b1374 100644 --- a/tensorflow/python/keras/_impl/keras/layers/noise.py +++ b/tensorflow/python/keras/_impl/keras/layers/noise.py @@ -22,7 +22,7 @@ import numpy as np from tensorflow.python.keras._impl.keras import backend as K from tensorflow.python.keras._impl.keras.engine import Layer -from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion +from tensorflow.python.keras._impl.keras.utils import tf_utils from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.util.tf_export import tf_export @@ -69,7 +69,7 @@ class GaussianNoise(Layer): base_config = super(GaussianNoise, self).get_config() return dict(list(base_config.items()) + list(config.items())) - @shape_type_conversion + @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): return input_shape @@ -116,7 +116,7 @@ class GaussianDropout(Layer): base_config = super(GaussianDropout, self).get_config() return dict(list(base_config.items()) + list(config.items())) - @shape_type_conversion + @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): return input_shape @@ -188,6 +188,6 @@ class AlphaDropout(Layer): base_config = super(AlphaDropout, self).get_config() return dict(list(base_config.items()) + list(config.items())) - @shape_type_conversion + @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): return input_shape diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent.py b/tensorflow/python/keras/_impl/keras/layers/recurrent.py index f53db987ff3..f6d6e1391c8 100644 --- a/tensorflow/python/keras/_impl/keras/layers/recurrent.py +++ b/tensorflow/python/keras/_impl/keras/layers/recurrent.py @@ -31,8 +31,8 @@ from tensorflow.python.keras._impl.keras import initializers from tensorflow.python.keras._impl.keras import regularizers from tensorflow.python.keras._impl.keras.engine import InputSpec from tensorflow.python.keras._impl.keras.engine import Layer -from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion -from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg +from tensorflow.python.keras._impl.keras.utils import generic_utils +from tensorflow.python.keras._impl.keras.utils import tf_utils from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import state_ops @@ -107,7 +107,7 @@ class StackedRNNCells(Layer): # Call the cells in order and store the returned states. 
new_nested_states = [] for cell, states in zip(self.cells, nested_states): - if has_arg(cell.call, 'constants'): + if generic_utils.has_arg(cell.call, 'constants'): inputs, states = cell.call(inputs, states, constants=constants, **kwargs) else: @@ -122,14 +122,14 @@ class StackedRNNCells(Layer): states += cell_states return inputs, states - @shape_type_conversion + @tf_utils.shape_type_conversion def build(self, input_shape): if isinstance(input_shape, list): constants_shape = input_shape[1:] input_shape = input_shape[0] for cell in self.cells: if isinstance(cell, Layer): - if has_arg(cell.call, 'constants'): + if generic_utils.has_arg(cell.call, 'constants'): cell.build([input_shape] + constants_shape) else: cell.build(input_shape) @@ -429,7 +429,7 @@ class RNN(Layer): def states(self, states): self._states = states - @shape_type_conversion + @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): if isinstance(input_shape, list): input_shape = input_shape[0] @@ -461,7 +461,7 @@ class RNN(Layer): else: return output_mask - @shape_type_conversion + @tf_utils.shape_type_conversion def build(self, input_shape): # Note input_shape will be list of shapes of initial states and # constants if these are passed in __call__. @@ -609,11 +609,11 @@ class RNN(Layer): 'or `batch_shape` argument to your Input layer.') kwargs = {} - if has_arg(self.cell.call, 'training'): + if generic_utils.has_arg(self.cell.call, 'training'): kwargs['training'] = training if constants: - if not has_arg(self.cell.call, 'constants'): + if not generic_utils.has_arg(self.cell.call, 'constants'): raise ValueError('RNN cell does not support constants') def step(inputs, states): @@ -884,7 +884,7 @@ class SimpleRNNCell(Layer): self._dropout_mask = None self._recurrent_dropout_mask = None - @shape_type_conversion + @tf_utils.shape_type_conversion def build(self, input_shape): self.kernel = self.add_weight( shape=(input_shape[-1], self.units), @@ -1287,7 +1287,7 @@ class GRUCell(Layer): self._dropout_mask = None self._recurrent_dropout_mask = None - @shape_type_conversion + @tf_utils.shape_type_conversion def build(self, input_shape): input_dim = input_shape[-1] self.kernel = self.add_weight( @@ -1824,7 +1824,7 @@ class LSTMCell(Layer): self._dropout_mask = None self._recurrent_dropout_mask = None - @shape_type_conversion + @tf_utils.shape_type_conversion def build(self, input_shape): input_dim = input_shape[-1] self.kernel = self.add_weight( @@ -2388,7 +2388,7 @@ class Recurrent(Layer): self.dropout = 0 self.recurrent_dropout = 0 - @shape_type_conversion + @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): if isinstance(input_shape, list): input_shape = input_shape[0] diff --git a/tensorflow/python/keras/_impl/keras/layers/wrappers.py b/tensorflow/python/keras/_impl/keras/layers/wrappers.py index 9aee5f03b6d..34a8eeeb5b5 100644 --- a/tensorflow/python/keras/_impl/keras/layers/wrappers.py +++ b/tensorflow/python/keras/_impl/keras/layers/wrappers.py @@ -23,11 +23,10 @@ import copy from tensorflow.python.framework import tensor_shape from tensorflow.python.keras._impl.keras import backend as K -from tensorflow.python.keras._impl.keras.engine import base_layer from tensorflow.python.keras._impl.keras.engine import InputSpec from tensorflow.python.keras._impl.keras.engine import Layer -from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion -from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg +from 
tensorflow.python.keras._impl.keras.utils import generic_utils +from tensorflow.python.keras._impl.keras.utils import tf_utils from tensorflow.python.ops import array_ops from tensorflow.python.util.tf_export import tf_export @@ -183,7 +182,7 @@ class TimeDistributed(Wrapper): def call(self, inputs, training=None, mask=None): kwargs = {} - if has_arg(self.layer.call, 'training'): + if generic_utils.has_arg(self.layer.call, 'training'): kwargs['training'] = training uses_learning_phase = False # pylint: disable=redefined-outer-name @@ -213,7 +212,7 @@ class TimeDistributed(Wrapper): input_length = array_ops.shape(inputs)[1] # Shape: (num_samples * timesteps, ...). And track the # transformation in self._input_map. - input_uid = base_layer.object_list_uid(inputs) + input_uid = generic_utils.object_list_uid(inputs) inputs = array_ops.reshape(inputs, (-1,) + input_shape[2:]) self._input_map[input_uid] = inputs # (num_samples * timesteps, ...) @@ -305,7 +304,7 @@ class Bidirectional(Wrapper): self.forward_layer.set_weights(weights[:nw // 2]) self.backward_layer.set_weights(weights[nw // 2:]) - @shape_type_conversion + @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): output_shape = tuple(self.forward_layer.compute_output_shape( input_shape).as_list()) @@ -383,12 +382,13 @@ class Bidirectional(Wrapper): def call(self, inputs, training=None, mask=None, initial_state=None): kwargs = {} - if has_arg(self.layer.call, 'training'): + if generic_utils.has_arg(self.layer.call, 'training'): kwargs['training'] = training - if has_arg(self.layer.call, 'mask'): + if generic_utils.has_arg(self.layer.call, 'mask'): kwargs['mask'] = mask - if initial_state is not None and has_arg(self.layer.call, 'initial_state'): + if initial_state is not None and generic_utils.has_arg( + self.layer.call, 'initial_state'): forward_state = initial_state[:len(initial_state) // 2] backward_state = initial_state[len(initial_state) // 2:] y = self.forward_layer.call(inputs, initial_state=forward_state, **kwargs) diff --git a/tensorflow/python/keras/_impl/keras/utils/generic_utils.py b/tensorflow/python/keras/_impl/keras/utils/generic_utils.py index 3bbe87f92d8..db184d278cf 100644 --- a/tensorflow/python/keras/_impl/keras/utils/generic_utils.py +++ b/tensorflow/python/keras/_impl/keras/utils/generic_utils.py @@ -21,6 +21,7 @@ import binascii import codecs import marshal import os +import re import sys import time import types as python_types @@ -28,6 +29,7 @@ import types as python_types import numpy as np import six +from tensorflow.python.util import nest from tensorflow.python.util import tf_decorator from tensorflow.python.util import tf_inspect from tensorflow.python.util.tf_export import tf_export @@ -526,3 +528,31 @@ def to_list(x): if isinstance(x, list): return x return [x] + + +def object_list_uid(object_list): + """Creates a single string from object ids.""" + object_list = nest.flatten(object_list) + return ', '.join([str(abs(id(x))) for x in object_list]) + + +def to_snake_case(name): + intermediate = re.sub('(.)([A-Z][a-z0-9]+)', r'\1_\2', name) + insecure = re.sub('([a-z])([A-Z])', r'\1_\2', intermediate).lower() + # If the class is private the name starts with "_" which is not secure + # for creating scopes. We prefix the name with "private" in this case. 
+ if insecure[0] != '_': + return insecure + return 'private' + insecure + + +def is_all_none(iterable_or_element): + if not isinstance(iterable_or_element, (list, tuple)): + iterable = [iterable_or_element] + else: + iterable = iterable_or_element + # We cannot use Python's `any` because the iterable may return Tensors. + for element in iterable: + if element is not None: + return False + return True diff --git a/tensorflow/python/keras/_impl/keras/utils/tf_utils.py b/tensorflow/python/keras/_impl/keras/utils/tf_utils.py index 8da5f777773..162e5b2cd65 100644 --- a/tensorflow/python/keras/_impl/keras/utils/tf_utils.py +++ b/tensorflow/python/keras/_impl/keras/utils/tf_utils.py @@ -17,9 +17,12 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.framework import ops from tensorflow.python.framework import smart_cond as smart_module +from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import variables +from tensorflow.python.util import nest def smart_cond(pred, true_fn=None, false_fn=None, name=None): @@ -72,3 +75,80 @@ def constant_value(pred): if isinstance(pred, variables.Variable): return None return smart_module.smart_constant_value(pred) + + +def is_tensor_or_tensor_list(v): + v = nest.flatten(v) + if v and isinstance(v[0], ops.Tensor): + return True + else: + return False + + +def get_reachable_from_inputs(inputs, targets=None): + """Returns the set of tensors/ops reachable from `inputs`. + + Stops if all targets have been found (target is optional). + + Only valid in Symbolic mode, not Eager mode. + + Args: + inputs: List of tensors. + targets: List of tensors. + + Returns: + A set of tensors reachable from the inputs (includes the inputs themselves). + """ + reachable = set(inputs) + if targets: + targets = set(targets) + queue = inputs[:] + + while queue: + x = queue.pop() + if isinstance(x, ops.Operation): + outputs = x.outputs[:] or [] + outputs += x._control_outputs # pylint: disable=protected-access + elif isinstance(x, ops.Tensor): + outputs = x.consumers() + elif isinstance(x, variables.Variable): + outputs = [x.op] + else: + raise TypeError('Expected Operation, Variable, or Tensor, got ' + str(x)) + + for y in outputs: + if y not in reachable: + reachable.add(y) + queue.insert(0, y) + + if targets and targets.issubset(reachable): + return reachable + return reachable + + +def shape_type_conversion(fn): + """Decorator that handles tuple/TensorShape conversion. + + Used in `compute_output_shape` and `build`. + + Arguments: + fn: function to wrap. + + Returns: + Wrapped function. + """ + + def wrapper(instance, input_shape): + if input_shape is not None: + if isinstance(input_shape, list): + input_shape = [ + tuple(tensor_shape.TensorShape(x).as_list()) for x in input_shape] + else: + input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list()) + output_shape = fn(instance, input_shape) + if output_shape is not None: + if isinstance(output_shape, list): + return [tensor_shape.TensorShape(x) for x in output_shape] + return tensor_shape.TensorShape(output_shape) + + return wrapper From c2b1eebe7e256dda88beb91c7fa7662e01d12f9b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 24 Apr 2018 11:43:35 -0700 Subject: [PATCH 0675/1734] Updating tests in constant_folding_test.cc so that the tests evaluate the original and optimized graphs and check that the output is the same. 
PiperOrigin-RevId: 194120424
---
 .../optimizers/constant_folding_test.cc       | 80 ++++++++++++++++++-
 1 file changed, 79 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 1acce05909c..32dca29e12d 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -520,6 +520,25 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_UnknownOutputShape) {
       EXPECT_EQ("Mul", node.op()) << node.name();
     }
   }
+
+  const std::vector<string> fetch = {"mul_0", "mul_4", "mul_8"};
+  auto x_known_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto x_partially_unknown_t =
+      GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
+  auto x_unknown_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 7}));
+  auto expected_tensors =
+      EvaluateNodes(item.graph, fetch,
+                    {{"x_known", x_known_t},
+                     {"x_partially_unknown", x_partially_unknown_t},
+                     {"x_unknown", x_unknown_t}});
+  EXPECT_EQ(fetch.size(), expected_tensors.size());
+  auto tensors = EvaluateNodes(output, fetch,
+                               {{"x_known", x_known_t},
+                                {"x_partially_unknown", x_partially_unknown_t},
+                                {"x_unknown", x_unknown_t}});
+  EXPECT_EQ(fetch.size(), tensors.size());
+  for (int i = 0; i < tensors.size(); i++)
+    test::ExpectTensorNear<float>(expected_tensors[i], tensors[i], 1e-5);
 }

 TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
@@ -572,6 +591,20 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
       EXPECT_TRUE(IsControlInput(node.input(1)));
     }
   }
+  const std::vector<string> fetch = {"addn1"};
+  auto x_partially_unknown_t =
+      GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto x_unknown_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto expected_tensors =
+      EvaluateNodes(item.graph, fetch,
+                    {{"x_partially_unknown", x_partially_unknown_t},
+                     {"x_unknown", x_unknown_t}});
+  EXPECT_EQ(1, expected_tensors.size());
+  auto tensors = EvaluateNodes(output, fetch,
+                               {{"x_partially_unknown", x_partially_unknown_t},
+                                {"x_unknown", x_unknown_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(expected_tensors[0], tensors[0], 1e-5);
 }

 TEST_F(ConstantFoldingTest, CreateConstNodes) {
@@ -1064,6 +1097,20 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN) {
     }
   }
   EXPECT_EQ(9, found);
+
+  auto v1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
+  auto v2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 6}));
+  auto v3_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4, 6}));
+  const std::vector<string> fetch_nodes = {"i1a", "i1b", "i2a", "i2b",
+                                           "i2c", "i3a", "i3b"};
+  auto tensors_expected = EvaluateNodes(
+      item.graph, fetch_nodes, {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
+  auto tensors = EvaluateNodes(output, fetch_nodes,
+                               {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors.size());
+  for (int i = 0; i < fetch_nodes.size(); i++)
+    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
 }

 TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN_MultipleOutputs) {
@@ -1930,6 +1977,14 @@ TEST_F(ConstantFoldingTest, Packing) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);

+  const std::vector<string> fetch_nodes = {"i1", "i2"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch_nodes);
+  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
+  auto tensors = EvaluateNodes(output, fetch_nodes);
+  EXPECT_EQ(fetch_nodes.size(), tensors.size());
+  for (int i = 0; i < fetch_nodes.size(); i++)
+    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
+
   // Make sure that the representation of the folded constant is space
   // efficient: in particular, the whole message should be smaller than 8k
   // (the size needed to naively encode 1000 floats folded twice).
@@ -1965,6 +2020,13 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);

+  std::vector<string> fetch_nodes = {"o1", "o2", "p1", "p2"};
+  auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({1, 5}));
+  auto g_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({1}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, fetch_nodes, {{"a", a_t}, {"g", g_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
+
   // Run a second time to make sure the optimization is idempotent.
   item.graph.Swap(&output);
   status = optimizer.Optimize(nullptr, item, &output);
@@ -2005,6 +2067,11 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
     }
   }
   EXPECT_EQ(6, found);
+
+  auto tensors = EvaluateNodes(output, fetch_nodes, {{"a", a_t}, {"g", g_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors.size());
+  for (int i = 0; i < fetch_nodes.size(); i++)
+    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
 }

 TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
@@ -2024,6 +2091,11 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));

+  std::vector<string> fetch_nodes = {"o1", "o2"};
+  auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+  auto tensors_expected = EvaluateNodes(item.graph, fetch_nodes, {{"a", a_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
+
   ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -2078,6 +2150,10 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
     }
   }
   EXPECT_EQ(7, found);
+  auto tensors = EvaluateNodes(output, fetch_nodes, {{"a", a_t}});
+  EXPECT_EQ(fetch_nodes.size(), tensors.size());
+  for (int i = 0; i < fetch_nodes.size(); i++)
+    test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
 }

 TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
@@ -2539,6 +2615,8 @@ TEST_F(ConstantFoldingTest, TrivialPack) {
   EXPECT_EQ(tensors_expected[0].shape(), tensors[0].shape());
 }

+// The test does not evaluate the optimized and original graphs to check if
+// their outputs are the same. See b/78233179.
 TEST_F(ConstantFoldingTest, Enter) {
   GrapplerItem item;
   AttrValue frame_name;
@@ -2555,7 +2633,7 @@ TEST_F(ConstantFoldingTest, Enter) {
   value_tensor.AsProtoTensorContent(value.mutable_tensor());

   GraphDef& graph = item.graph;
-  AddNode("x", "Placeholder", {}, {{"T", type}}, &graph);
+  AddNode("x", "Placeholder", {}, {{"dtype", type}}, &graph);
   AddNode("c1", "Const", {"^x"}, {{"value", value}, {"dtype", type}}, &graph);
   AddNode("enter1", "Enter", {"x"},
           {{"T", type},

From 9992042548ff268ac97ac3ebf1c584d380b0c106 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Tue, 24 Apr 2018 11:46:17 -0700
Subject: [PATCH 0676/1734] Go: Update generated wrapper functions for TensorFlow ops.
PiperOrigin-RevId: 194120868 --- tensorflow/go/op/wrappers.go | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index d038846c4f2..4d91f2b68e2 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -9602,6 +9602,14 @@ func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr { } } +// ResourceApplyAdagradUpdateSlots sets the optional update_slots attribute to value. +// If not specified, defaults to true +func ResourceApplyAdagradUpdateSlots(value bool) ResourceApplyAdagradAttr { + return func(m optionalAttr) { + m["update_slots"] = value + } +} + // Update '*var' according to the adagrad scheme. // // accum += grad * grad @@ -10676,6 +10684,14 @@ func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagrad } } +// ResourceSparseApplyAdagradUpdateSlots sets the optional update_slots attribute to value. +// If not specified, defaults to true +func ResourceSparseApplyAdagradUpdateSlots(value bool) ResourceSparseApplyAdagradAttr { + return func(m optionalAttr) { + m["update_slots"] = value + } +} + // Update relevant entries in '*var' and '*accum' according to the adagrad scheme. // // That is for rows we have grad for, we update var and accum as follows: From e6e43da77e9be2e7e455d94e9724983a263f310a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 24 Apr 2018 11:49:35 -0700 Subject: [PATCH 0677/1734] Clarify error encountered when serializing critical_section_executions is a warning. PiperOrigin-RevId: 194121508 --- tensorflow/python/framework/meta_graph.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/framework/meta_graph.py b/tensorflow/python/framework/meta_graph.py index 391b17720c6..923e76fc9c8 100644 --- a/tensorflow/python/framework/meta_graph.py +++ b/tensorflow/python/framework/meta_graph.py @@ -439,9 +439,10 @@ def add_collection_def(meta_graph_def, key, graph=None, else: getattr(col_def, kind).value.extend([x for x in collection_list]) except Exception as e: # pylint: disable=broad-except - logging.warning("Error encountered when serializing %s.\n" + logging.warning("Issue encountered when serializing %s.\n" "Type is unsupported, or the types of the items don't " - "match field type in CollectionDef.\n%s", key, str(e)) + "match field type in CollectionDef. Note this is a warning " + "and probably safe to ignore.\n%s", key, str(e)) if key in meta_graph_def.collection_def: del meta_graph_def.collection_def[key] return From 7afe5df6b12309e20b471ce52a2549e6d6ea1745 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 24 Apr 2018 12:45:45 -0700 Subject: [PATCH 0678/1734] Extract OptimizeGraph function in meta-optimizer. 
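The refactoring below only reorganizes how the meta-optimizer builds and runs its list of passes; which passes run is still driven by the RewriterConfig proto. A minimal sketch of setting those toggles from Python, assuming the TF 1.x session API of this era (the exact combination of fields is illustrative only):

    import tensorflow as tf
    from tensorflow.core.protobuf import rewriter_config_pb2

    rewrite_options = rewriter_config_pb2.RewriterConfig(
        constant_folding=rewriter_config_pb2.RewriterConfig.ON,
        arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
        meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.TWO)
    config = tf.ConfigProto(
        graph_options=tf.GraphOptions(rewrite_options=rewrite_options))
    with tf.Session(config=config) as sess:
      # Graphs executed in this session pass through the meta-optimizer,
      # which here runs up to two iterations of the enabled passes.
      print(sess.run(tf.add(tf.constant(1.0), tf.constant(2.0))))

Note how meta_optimizer_iterations corresponds to the NumIterations helper introduced in this patch.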
PiperOrigin-RevId: 194129729
---
 .../optimizers/constant_folding_test.cc       |   1 -
 .../grappler/optimizers/meta_optimizer.cc     | 287 +++++++++---------
 .../core/grappler/optimizers/meta_optimizer.h |  32 +-
 3 files changed, 180 insertions(+), 140 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 32dca29e12d..25693c5c60b 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -2528,7 +2528,6 @@ TEST_F(ConstantFoldingTest, PartialFolding_IdentityN) {
   ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
-  LOG(INFO) << output.DebugString();
   TF_EXPECT_OK(status);
   EXPECT_EQ(8, output.node_size());
   for (const auto& node : output.node()) {
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 335fb403f18..c98eef1a6a5 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -36,6 +36,9 @@ namespace tensorflow {
 namespace grappler {

 namespace {
+
+constexpr int kDefaultNumberOfIterations = 1;
+
 int64 NumEdges(const GraphDef& graph) {
   int64 num_edges = 0;
   for (const auto& node : graph.node()) {
@@ -50,144 +53,144 @@ string PrintSizesBeforeAfter(const GraphDef& before, const GraphDef& after) {
                          NumEdges(after), " edges (",
                          NumEdges(after) - NumEdges(before), ")");
 }
-}  // namespace

-std::unique_ptr<GraphOptimizer> MetaOptimizer::NewOptimizer(
-    const string& optimizer) {
-  std::unique_ptr<GraphOptimizer> graph_optimizer;
-  if (optimizer == "pruning") {
-    graph_optimizer.reset(new ModelPruner());
-  }
-  if (optimizer == "function") {
-    graph_optimizer.reset(new FunctionOptimizer(cfg_.function_optimization()));
-  }
-  if (optimizer == "constfold") {
-    graph_optimizer.reset(new ConstantFolding(cpu_device_));
-  }
-  if (optimizer == "layout") {
-    graph_optimizer.reset(new LayoutOptimizer());
-  }
-  if (optimizer == "memory") {
-    graph_optimizer.reset(new MemoryOptimizer(RewriterConfig::MANUAL));
-  }
-  if (optimizer == "arithmetic") {
-    graph_optimizer.reset(
-        new ArithmeticOptimizer(cfg_.arithmetic_optimization()));
-  }
-  if (optimizer == "autoparallel") {
-    graph_optimizer.reset(
-        new AutoParallel(cfg_.auto_parallel().num_replicas()));
-  }
-  if (optimizer == "loop") {
-    graph_optimizer.reset(new LoopOptimizer(cfg_.loop_optimization()));
-  }
-  if (optimizer == "dependency") {
-    graph_optimizer.reset(
-        new DependencyOptimizer(cfg_.dependency_optimization()));
-  }
-  if (optimizer == "debug_stripper") {
-    graph_optimizer.reset(new DebugStripper());
-  }
-  return graph_optimizer;
+int NumIterations(const RewriterConfig& cfg) {
+  return cfg.meta_optimizer_iterations() == RewriterConfig::DEFAULT_NUM_ITERS
+             ? kDefaultNumberOfIterations
+             : cfg.meta_optimizer_iterations();
 }

-Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
-                               GraphDef* optimized_graph) {
-  std::vector<std::unique_ptr<GraphOptimizer>> optimizers;
-  if (cfg_.optimizers().empty()) {
-    if (!cfg_.disable_model_pruning()) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(new ModelPruner()));
-    }
-    if (cfg_.function_optimization() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new FunctionOptimizer(cfg_.function_optimization())));
-    }
-    if (cfg_.debug_stripper() == RewriterConfig::ON) {
-      optimizers.push_back(
-          std::unique_ptr<GraphOptimizer>(new DebugStripper()));
-    }
-    if (cfg_.constant_folding() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new ConstantFolding(cfg_.constant_folding(), cpu_device_)));
-    }
-    if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new ArithmeticOptimizer(cfg_.arithmetic_optimization())));
-    }
-    if (cfg_.loop_optimization() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new LoopOptimizer(cfg_.loop_optimization())));
-    }
-    if (cfg_.dependency_optimization() != RewriterConfig::OFF) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new DependencyOptimizer(cfg_.dependency_optimization())));
-    }
-    if (cfg_.layout_optimizer() != RewriterConfig::OFF) {
-      optimizers.push_back(
-          std::unique_ptr<GraphOptimizer>(new LayoutOptimizer()));
-    }
-    if (cfg_.memory_optimization() != RewriterConfig::NO_MEM_OPT) {
-      if (cfg_.memory_optimizer_target_node_name_scope().empty()) {
-        optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-            // Use the default target node name prefix "gradients/"
-            new MemoryOptimizer(cfg_.memory_optimization())));
-      } else {
-        optimizers.push_back(
-            std::unique_ptr<GraphOptimizer>(new MemoryOptimizer(
-                cfg_.memory_optimization(),
-                cfg_.memory_optimizer_target_node_name_scope())));
-      }
-    }
-    if (cfg_.auto_parallel().enable()) {
-      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
-          new AutoParallel(cfg_.auto_parallel().num_replicas())));
-    }
-  } else {
-    const std::set<string> available_optimizers = {
-        "pruning",    "function",     "constfold",  "layout",
-        "memory",     "autoparallel", "arithmetic", "loop",
-        "dependency", "debug_stripper"};
-    std::vector<string> custom_optimizer_names;
-    for (const auto& optimizer_name : cfg_.optimizers()) {
-      if (available_optimizers.find(optimizer_name) !=
-          available_optimizers.end()) {
-        optimizers.push_back(NewOptimizer(optimizer_name));
-      } else {
-        custom_optimizer_names.push_back(optimizer_name);
-      }
-    }
-    // Now run the custom optimizers.
-    for (const auto& optimizer_name : custom_optimizer_names) {
-      std::unique_ptr<CustomGraphOptimizer> opt =
-          CustomGraphOptimizerRegistry::CreateByNameOrNull(optimizer_name);
-      if (opt == nullptr) continue;
-      TF_RETURN_IF_ERROR(opt->Init());
-      optimizers.push_back(std::move(opt));
+// Check if optimizer is allowed to run only once.
+bool IsRunOnceOptimizer(const string& name) { return name == "layout"; }
+
+}  // namespace
+
+#define MK_OPT(NAME, VALUE) \
+  if (optimizer == NAME) return std::unique_ptr<GraphOptimizer>(VALUE)
+
+std::unique_ptr<GraphOptimizer> MetaOptimizer::MakeNewOptimizer(
+    const string& optimizer) const {
+  MK_OPT("pruning", new ModelPruner());
+  MK_OPT("function", new FunctionOptimizer(cfg_.function_optimization()));
+  MK_OPT("constfold", new ConstantFolding(cpu_device_));
+  MK_OPT("layout", new LayoutOptimizer());
+  MK_OPT("memory", new MemoryOptimizer(RewriterConfig::MANUAL));
+  MK_OPT("arithmetic", new ArithmeticOptimizer(cfg_.arithmetic_optimization()));
+  MK_OPT("autoparallel", new AutoParallel(cfg_.auto_parallel().num_replicas()));
+  MK_OPT("loop", new LoopOptimizer(cfg_.loop_optimization()));
+  MK_OPT("dependency", new DependencyOptimizer(cfg_.dependency_optimization()));
+  MK_OPT("debug_stripper", new DebugStripper());
+
+  return std::unique_ptr<GraphOptimizer>();
+}
+
+#undef MK_OPT
+
+Status MetaOptimizer::InitializeOptimizers(
+    std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const {
+  if (!cfg_.disable_model_pruning()) {
+    optimizers->emplace_back(new ModelPruner());
+  }
+  if (cfg_.function_optimization() != RewriterConfig::OFF) {
+    optimizers->emplace_back(
+        new FunctionOptimizer(cfg_.function_optimization()));
+  }
+  if (cfg_.debug_stripper() == RewriterConfig::ON) {
+    optimizers->emplace_back(new DebugStripper());
+  }
+  if (cfg_.constant_folding() != RewriterConfig::OFF) {
+    optimizers->emplace_back(
+        new ConstantFolding(cfg_.constant_folding(), cpu_device_));
+  }
+  if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) {
+    optimizers->emplace_back(
+        new ArithmeticOptimizer(cfg_.arithmetic_optimization()));
+  }
+  if (cfg_.loop_optimization() != RewriterConfig::OFF) {
+    optimizers->emplace_back(new LoopOptimizer(cfg_.loop_optimization()));
+  }
+  if (cfg_.dependency_optimization() != RewriterConfig::OFF) {
+    optimizers->emplace_back(
+        new DependencyOptimizer(cfg_.dependency_optimization()));
+  }
+  if (cfg_.layout_optimizer() != RewriterConfig::OFF) {
+    optimizers->emplace_back(new LayoutOptimizer());
+  }
+  if (cfg_.memory_optimization() != RewriterConfig::NO_MEM_OPT) {
+    if (cfg_.memory_optimizer_target_node_name_scope().empty()) {
+      optimizers->emplace_back(
+          // Use the default target node name prefix "gradients/"
+          new MemoryOptimizer(cfg_.memory_optimization()));
+    } else {
+      optimizers->emplace_back(
+          new MemoryOptimizer(cfg_.memory_optimization(),
+                              cfg_.memory_optimizer_target_node_name_scope()));
     }
   }
+  if (cfg_.auto_parallel().enable()) {
+    optimizers->emplace_back(
+        new AutoParallel(cfg_.auto_parallel().num_replicas()));
+  }
+  return Status::OK();
+}
+
+Status MetaOptimizer::InitializeOptimizersByName(
+    std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const {
+  for (const string& optimizer_name : cfg_.optimizers()) {
+    auto optimizer = MakeNewOptimizer(optimizer_name);
+    if (optimizer) {
+      VLOG(2) << "Registered default graph optimizer: " << optimizer_name;
+      optimizers->push_back(std::move(optimizer));
+      continue;
+    }
+
+    auto custom_optimizer =
+        CustomGraphOptimizerRegistry::CreateByNameOrNull(optimizer_name);
+
+    if (custom_optimizer) {
+      VLOG(2) << "Registered custom graph optimizer: " << optimizer_name;
+      TF_RETURN_IF_ERROR(custom_optimizer->Init());
+      optimizers->push_back(std::move(custom_optimizer));
+    } else {
+      VLOG(2) << "Can't register an optimizer by name: " << optimizer_name;
+    }
+  }
+  return Status::OK();
+}
+
+Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
+                                    GraphDef* optimized_graph) {
+  std::vector<std::unique_ptr<GraphOptimizer>> optimizers;
+  if (cfg_.optimizers().empty()) {
+    TF_RETURN_IF_ERROR(InitializeOptimizers(&optimizers));
+  } else {
+    TF_RETURN_IF_ERROR(InitializeOptimizersByName(&optimizers));
+  }
+
+  VLOG(2) << "Optimize GrapplerItem: item.id=" << item.id
+          << " num_optimizers=" << optimizers.size();

   if (optimizers.empty()) {
+    VLOG(3) << "Skip graph optimization, no optimizers registered";
     *optimized_graph = item.graph;
     return Status::OK();
   }

-  // Some optimizers should be run only once.
-  const std::set<string> run_once_optimizers = {"layout"};
-  bool already_optimized = false;
-  const int num_iterations =
-      cfg_.meta_optimizer_iterations() == RewriterConfig::DEFAULT_NUM_ITERS
-          ? 1
-          : cfg_.meta_optimizer_iterations();
+  // Invariant: optimized_graph contains the most recently optimized version of
+  // the graph.
   GrapplerItem optimized_item = item;
   optimized_graph->Swap(&optimized_item.graph);
-  for (int iteration = 0; iteration < num_iterations; ++iteration) {
-    VLOG(1) << "Starting optimization iteration " << iteration + 1;
+
+  bool is_optimized = false;
+  GraphOptimizationResult optimization_result(item.id);
+
+  for (int iteration = 0; iteration < NumIterations(cfg_); ++iteration) {
+    VLOG(4) << "Starting optimization iteration " << iteration + 1;
+
     for (const auto& optimizer : optimizers) {
-      // Invariant: optimized_graph contains the most recently optimized
-      // version of the graph.
-      if (iteration > 0 && run_once_optimizers.count(optimizer->name())) {
-        continue;
-      }
+      // Some optimizers can run only once.
+      if (iteration > 0 && IsRunOnceOptimizer(optimizer->name())) continue;
+
       uint64 start_us = Env::Default()->NowMicros();
       // This swaps the current optimized_graph into optimized item and
       // resets optimized_graph to an empty graph.
@@ -195,41 +198,53 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       *optimized_graph = GraphDef();
       Status status =
           optimizer->Optimize(cluster, optimized_item, optimized_graph);
       uint64 end_us = Env::Default()->NowMicros();
-      float duration_ms = (end_us - start_us) / 1000.0f;
+
       string result;
       if (!status.ok()) {
-        VLOG(1) << "Not able to apply optimizer " << optimizer->name() << ": "
-                << status.ToString();
         optimized_graph->Swap(&optimized_item.graph);
         result = status.ToString();
       } else {
-        already_optimized = true;
+        is_optimized = true;
+        float duration_ms = (end_us - start_us) / 1000.0f;
         result = strings::StrCat(
-            optimizer->name(), ": ",
             PrintSizesBeforeAfter(optimized_item.graph, *optimized_graph),
             ", time = ", duration_ms, "ms.");
       }
-      result_.emplace_back(optimizer->name(), result);
-      VLOG(1) << result;
+      VLOG(4) << optimizer->name() << ": " << result;
+
+      OptimizerResult optimizer_result{optimizer->name(), result};
+      optimization_result.results.push_back(optimizer_result);
     }
   }

-  if (already_optimized) {
+  // Record graph optimization result.
+  optimization_results_.push_back(optimization_result);
+
+  if (is_optimized) {
     TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph));
     ReassignColocation(optimized_graph);
     // Make sure that the optimizers preserved the graph version.
     DCHECK_EQ(optimized_graph->versions().producer(),
               item.graph.versions().producer());
   }
+
+  return Status::OK();
+}
+
+Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
+                               GraphDef* optimized_graph) {
+  optimization_results_.clear();
+  TF_RETURN_IF_ERROR(OptimizeGraph(cluster, item, optimized_graph));
   return Status::OK();
 }

 void MetaOptimizer::PrintResult() {
-  for (const auto& result : result_) {
-    LOG(INFO) << "Return status of optimizer " << result.first << ": "
-              << result.second;
+  for (const GraphOptimizationResult& graph_result : optimization_results_) {
+    LOG(INFO) << "Optimization results for grappler item: " << graph_result.id;
+    for (const OptimizerResult& result : graph_result.results) {
+      LOG(INFO) << "  " << result.optimizer_name << ": " << result.result;
+    }
   }
 }

diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h
index 382cfe51d42..b8d46662489 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h
@@ -30,7 +30,7 @@ class MetaOptimizer : public GraphOptimizer {
  public:
   MetaOptimizer(DeviceBase* cpu_device, const RewriterConfig& cfg)
       : cpu_device_(cpu_device), cfg_(cfg) {}
-  ~MetaOptimizer() override {}
+  ~MetaOptimizer() override = default;

   string name() const override { return "meta_optimizer"; };

@@ -43,10 +43,36 @@ class MetaOptimizer : public GraphOptimizer {
                 const GraphDef& optimized_graph, double result) override;

  private:
-  std::unique_ptr<GraphOptimizer> NewOptimizer(const string& optimizer);
+  std::unique_ptr<GraphOptimizer> MakeNewOptimizer(
+      const string& optimizer) const;
+
+  // Initialize active optimizers from RewriterConfig toggles.
+  Status InitializeOptimizers(
+      std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const;
+  // Initialize active optimizers from RewriterConfig optimizer names.
+  Status InitializeOptimizersByName(
+      std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const;
+
+  // Run optimization pass over a single GrapplerItem. Meta optimizer might run
+  // multiple such passes: 1) for the main graph 2) for the function library
+  Status OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
+                       GraphDef* optimized_graph);
+
   DeviceBase* const cpu_device_;  // may be NULL
   RewriterConfig cfg_;
-  std::vector<std::pair<string, string>> result_;
+
+  struct OptimizerResult {
+    string optimizer_name;
+    string result;
+  };
+
+  struct GraphOptimizationResult {
+    explicit GraphOptimizationResult(const string& id) : id(id) {}
+    string id;
+    std::vector<OptimizerResult> results;
+  };
+
+  std::vector<GraphOptimizationResult> optimization_results_;
 };

 bool MetaOptimizerEnabled(const RewriterConfig& cfg);

From 33ffc8e7ff5090b92951c7faac150042dd814085 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Tue, 24 Apr 2018 13:08:51 -0700
Subject: [PATCH 0679/1734] embedding_lookup_sparse documentation change.

Remove "(typically from FeatureValueToId)" from args descriptions. This
appears to have been an obsolete reference from an ancestor implementation.

PiperOrigin-RevId: 194133212
---
 tensorflow/python/ops/embedding_ops.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index 9e46739bc1b..6f2a34c731c 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -331,8 +331,8 @@ def embedding_lookup_sparse(params,
       representing sharded embedding tensors.  Alternatively, a
      `PartitionedVariable`, created by partitioning along dimension 0.
Each element must be appropriately sized for the given `partition_strategy`. - sp_ids: N x M `SparseTensor` of int64 ids (typically from FeatureValueToId), - where N is typically batch size and M is arbitrary. + sp_ids: N x M `SparseTensor` of int64 ids where N is typically batch size + and M is arbitrary. sp_weights: either a `SparseTensor` of float / double weights, or `None` to indicate all weights should be taken to be 1. If specified, `sp_weights` must have exactly the same shape and indices as `sp_ids`. From 893aa776009418c841d49c924207f3cdaf1d5174 Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Tue, 24 Apr 2018 13:13:18 -0700 Subject: [PATCH 0680/1734] Fixing concurrency issues in RPC factory. PiperOrigin-RevId: 194133903 --- .../contrib/rpc/python/kernel_tests/BUILD | 1 - .../rpc/python/kernel_tests/rpc_op_test.py | 1 + .../python/kernel_tests/rpc_op_test_base.py | 62 ++++--- .../rpc/grpc_rpc_factory.cc | 135 +++++++------- .../rpc/grpc_rpc_factory.h | 18 ++ tensorflow/core/util/rpc/call_container.h | 165 +++++++++++++----- tensorflow/core/util/rpc/rpc_factory.h | 5 +- 7 files changed, 252 insertions(+), 135 deletions(-) diff --git a/tensorflow/contrib/rpc/python/kernel_tests/BUILD b/tensorflow/contrib/rpc/python/kernel_tests/BUILD index f3e6731213f..2311c15a68c 100644 --- a/tensorflow/contrib/rpc/python/kernel_tests/BUILD +++ b/tensorflow/contrib/rpc/python/kernel_tests/BUILD @@ -28,7 +28,6 @@ py_library( py_library( name = "rpc_op_test_base", srcs = ["rpc_op_test_base.py"], - tags = ["notsan"], deps = [ ":test_example_proto_py", "//tensorflow/contrib/proto", diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py index e2e0dbc7a22..3fc6bfbb4d0 100644 --- a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py +++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py @@ -35,6 +35,7 @@ class RpcOpTest(test.TestCase, rpc_op_test_base.RpcOpTestBase): _protocol = 'grpc' invalid_method_string = 'Method not found' + connect_failed_string = 'Connect Failed' def __init__(self, methodName='runTest'): # pylint: disable=invalid-name super(RpcOpTest, self).__init__(methodName) diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py index 89f3ee1a1c5..27273d16b1c 100644 --- a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py +++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py @@ -93,40 +93,39 @@ class RpcOpTestBase(object): response_values = sess.run(response_tensors) self.assertAllEqual(response_values.shape, [0]) - def testInvalidAddresses(self): + def testInvalidMethod(self): + for method in [ + '/InvalidService.IncrementTestShapes', + self.get_method_name('InvalidMethodName') + ]: + with self.test_session() as sess: + with self.assertRaisesOpError(self.invalid_method_string): + sess.run(self.rpc(method=method, address=self._address, request='')) + + _, status_code_value, status_message_value = sess.run( + self.try_rpc(method=method, address=self._address, request='')) + self.assertEqual(errors.UNIMPLEMENTED, status_code_value) + self.assertTrue( + self.invalid_method_string in status_message_value.decode('ascii')) + + def testInvalidAddress(self): + # This covers the case of address='' and address='localhost:293874293874' + address = 'unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@' with self.test_session() as sess: - with self.assertRaisesOpError(self.invalid_method_string): - sess.run( - 
self.rpc( - method='/InvalidService.IncrementTestShapes', - address=self._address, - request='')) - - with self.assertRaisesOpError(self.invalid_method_string): - sess.run( - self.rpc( - method=self.get_method_name('InvalidMethodName'), - address=self._address, - request='')) - - # This also covers the case of address='' - # and address='localhost:293874293874' with self.assertRaises(errors.UnavailableError): sess.run( self.rpc( method=self.get_method_name('IncrementTestShapes'), - address='unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@', + address=address, request='')) - - # Test invalid method with the TryRpc op _, status_code_value, status_message_value = sess.run( self.try_rpc( - method=self.get_method_name('InvalidMethodName'), - address=self._address, + method=self.get_method_name('IncrementTestShapes'), + address=address, request='')) - self.assertEqual(errors.UNIMPLEMENTED, status_code_value) + self.assertEqual(errors.UNAVAILABLE, status_code_value) self.assertTrue( - self.invalid_method_string in status_message_value.decode('ascii')) + self.connect_failed_string in status_message_value.decode('ascii')) def testAlwaysFailingMethod(self): with self.test_session() as sess: @@ -138,6 +137,18 @@ class RpcOpTestBase(object): with self.assertRaisesOpError(I_WARNED_YOU): sess.run(response_tensors) + response_tensors, status_code, status_message = self.try_rpc( + method=self.get_method_name('AlwaysFailWithInvalidArgument'), + address=self._address, + request='') + self.assertEqual(response_tensors.shape, ()) + self.assertEqual(status_code.shape, ()) + self.assertEqual(status_message.shape, ()) + status_code_value, status_message_value = sess.run((status_code, + status_message)) + self.assertEqual(errors.INVALID_ARGUMENT, status_code_value) + self.assertTrue(I_WARNED_YOU in status_message_value.decode('ascii')) + def testSometimesFailingMethodWithManyRequests(self): with self.test_session() as sess: # Fail hard by default. @@ -197,8 +208,7 @@ class RpcOpTestBase(object): address=self._address, request=request_tensors) for _ in range(10) ] - # Launch parallel 10 calls to the RpcOp, each containing - # 20 rpc requests. + # Launch parallel 10 calls to the RpcOp, each containing 20 rpc requests. many_response_values = sess.run(many_response_tensors) self.assertEqual(10, len(many_response_values)) for response_values in many_response_values: diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc index d004abd1c18..cde6b785dc6 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc @@ -30,7 +30,7 @@ limitations under the License. 
namespace tensorflow { -namespace { +namespace internal { class GrpcCall { public: explicit GrpcCall(CallContainer* container, int index, bool try_rpc, @@ -57,9 +57,10 @@ class GrpcCall { container_->Done(s, index_); } + CallOptions* call_opts() { return &call_opts_; } + int index() { return index_; } const string& request() const { return *request_msg_; } string* response() const { return response_msg_; } - CallOptions* call_opts() { return &call_opts_; } private: CallContainer* const container_; @@ -72,7 +73,9 @@ class GrpcCall { string* status_message_; }; -} // namespace +} // namespace internal + +using internal::GrpcCall; GrpcRPCFactory::GrpcRPCFactory(OpKernelConstruction* ctx, bool fail_fast, int64 timeout_in_ms) @@ -110,28 +113,6 @@ void GrpcRPCFactory::Call(OpKernelContext* ctx, int64 num_elements, Tensor* response_t, Tensor* status_code_t, Tensor* status_message_t, AsyncOpKernel::DoneCallback done) { - auto address = address_t.flat(); - auto method = method_t.flat(); - auto request = request_t.flat(); - - // Stubs are maintained by the GrpcRPCFactory class and will be - // deleted when the class is destroyed. - ::grpc::GenericStub* singleton_stub = nullptr; - if (address.size() == 1) { - singleton_stub = GetOrCreateStubForAddress(address(0)); - } - auto get_stub = [&address, this, - singleton_stub](int64 ix) -> ::grpc::GenericStub* { - return (address.size() > 1) ? GetOrCreateStubForAddress(address(ix)) - : singleton_stub; - }; - auto get_method_ptr = [&method](int64 ix) -> const string* { - return (method.size() > 1) ? &(method(ix)) : &(method(0)); - }; - auto get_request_ptr = [&request](int64 ix) -> const string* { - return (request.size() > 1) ? &(request(ix)) : &(request(0)); - }; - if (try_rpc) { // In this case status_code will never be set in the response, // so we just set it to OK. @@ -140,49 +121,22 @@ void GrpcRPCFactory::Call(OpKernelContext* ctx, int64 num_elements, static_cast(errors::Code::OK)); } - CancellationManager* cm = ctx->cancellation_manager(); - CancellationToken cancellation_token = cm->get_cancellation_token(); + CallContainer::CreateCallFn create_call_fn = + [this, &request_t, &try_rpc, response_t, status_code_t, status_message_t]( + CallContainer* container, int index) { + CreateCall(request_t, try_rpc, index, container, response_t, + status_code_t, status_message_t); + }; + + CallContainer::StartCallFn start_call_fn = + [this, &address_t, &method_t](GrpcCall* call) { + StartCall(address_t, method_t, call); + }; // This object will delete itself when done. - auto* container = - new CallContainer(ctx, num_elements, fail_fast_, try_rpc, - std::move(done), cancellation_token); - - auto response = response_t->flat(); - int32* status_code_ptr = nullptr; - string* status_message_ptr = nullptr; - if (try_rpc) { - status_code_ptr = status_code_t->flat().data(); - status_message_ptr = status_message_t->flat().data(); - } - for (int i = 0; i < num_elements; ++i) { - container->calls()->emplace_back( - container, i, try_rpc, get_request_ptr(i), &response(i), - (try_rpc) ? &status_code_ptr[i] : nullptr, - (try_rpc) ? &status_message_ptr[i] : nullptr); - } - - int i = 0; - for (GrpcCall& call : *(container->calls())) { - // This object will delete itself when done. 
- new RPCState(get_stub(i), &completion_queue_, *get_method_ptr(i), - call.request(), call.response(), - /*done=*/[&call](const Status& s) { call.Done(s); }, - call.call_opts(), fail_fast_, timeout_in_ms_); - ++i; - } - - // Need to register this callback after all the RPCs are in - // flight; otherwise we may try to cancel an RPC *before* it - // launches, which is a no-op, and then fall into a deadlock. - bool is_cancelled = !cm->RegisterCallback( - cancellation_token, [container]() { container->StartCancel(); }); - - if (is_cancelled) { - ctx->SetStatus(errors::Cancelled("Operation has been cancelled.")); - // container's reference counter will take care of calling done(). - container->StartCancel(); - } + new CallContainer(ctx, num_elements, fail_fast_, try_rpc, + std::move(done), std::move(create_call_fn), + std::move(start_call_fn)); } ::grpc::GenericStub* GrpcRPCFactory::GetOrCreateStubForAddress( @@ -210,4 +164,53 @@ GrpcRPCFactory::ChannelPtr GrpcRPCFactory::CreateChannelForAddress( /*target=*/address, ::grpc::InsecureChannelCredentials(), args); } +void GrpcRPCFactory::CreateCall(const Tensor& request_t, const bool try_rpc, + int index, CallContainer* container, + Tensor* response_t, Tensor* status_code_t, + Tensor* status_message_t) { + auto request = request_t.flat(); + auto get_request_ptr = [&request](int64 ix) -> const string* { + return (request.size() > 1) ? &(request(ix)) : &(request(0)); + }; + auto response = response_t->flat(); + int32* status_code_ptr = nullptr; + string* status_message_ptr = nullptr; + if (try_rpc) { + status_code_ptr = status_code_t->flat().data(); + status_message_ptr = status_message_t->flat().data(); + } + container->RegisterCall(container, index, try_rpc, get_request_ptr(index), + &response(index), + (try_rpc) ? &status_code_ptr[index] : nullptr, + (try_rpc) ? &status_message_ptr[index] : nullptr); +} + +void GrpcRPCFactory::StartCall(const Tensor& address_t, const Tensor& method_t, + GrpcCall* call) { + auto address = address_t.flat(); + auto method = method_t.flat(); + // Stubs are maintained by the GrpcRPCFactory class and will be + // deleted when the class is destroyed. + ::grpc::GenericStub* singleton_stub = nullptr; + if (address.size() == 1) { + singleton_stub = GetOrCreateStubForAddress(address(0)); + } + auto get_stub = [&address, this, + singleton_stub](int64 ix) -> ::grpc::GenericStub* { + return (address.size() > 1) ? GetOrCreateStubForAddress(address(ix)) + : singleton_stub; + }; + auto get_method_ptr = [&method](int64 ix) -> const string* { + return (method.size() > 1) ? &(method(ix)) : &(method(0)); + }; + + int index = call->index(); + // This object will delete itself when done. + new RPCState(get_stub(index), &completion_queue_, + *get_method_ptr(index), call->request(), + call->response(), + /*done=*/[call](const Status& s) { call->Done(s); }, + call->call_opts(), fail_fast_, timeout_in_ms_); +} + } // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h index 34ec235aafc..29394c84b55 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h @@ -20,10 +20,16 @@ limitations under the License. 
#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/util/rpc/call_container.h" #include "tensorflow/core/util/rpc/rpc_factory.h" namespace tensorflow { +// Forward declaration of GrpcCall. +namespace internal { +class GrpcCall; +} // namespace internal + class GrpcRPCFactory : public RPCFactory { public: explicit GrpcRPCFactory(OpKernelConstruction* ctx, bool fail_fast, @@ -42,6 +48,18 @@ class GrpcRPCFactory : public RPCFactory { virtual ChannelPtr CreateChannelForAddress(const string& address); private: + // Creates a call and registers it with given `container`. The `index` is used + // to index into the tensor arguments. + void CreateCall(const Tensor& request_t, const bool try_rpc, int index, + CallContainer* container, + Tensor* response_t, Tensor* status_code_t, + Tensor* status_message_t); + + // Asynchronously invokes the given `call`. The call completion is handled + // by the call container the call was previously registered with. + void StartCall(const Tensor& address_t, const Tensor& method_t, + internal::GrpcCall* call); + ::grpc::GenericStub* GetOrCreateStubForAddress(const string& address); bool fail_fast_; diff --git a/tensorflow/core/util/rpc/call_container.h b/tensorflow/core/util/rpc/call_container.h index 7f360567975..e1226a7f162 100644 --- a/tensorflow/core/util/rpc/call_container.h +++ b/tensorflow/core/util/rpc/call_container.h @@ -26,53 +26,60 @@ limitations under the License. namespace tensorflow { -template +namespace internal { +// The following class is used for coordination between a `CallContainer` +// instance and a cancellation callback to make sure that the `CallContainer` +// instance waits for the cancellation callback to be destroyed (either because +// a cancellation occurred or because the callback was deregistered) before +// deleting itself. Without this coordination the cancellation callback could +// attempt to access a `CallContainer` instance that is no longer valid. +class NotifyWhenDestroyed { + public: + explicit NotifyWhenDestroyed(std::shared_ptr notification) + : notification_(std::move(notification)) {} + + ~NotifyWhenDestroyed() { notification_->Notify(); } + + private: + std::shared_ptr notification_; +}; +} // namespace internal + +// The following class is responsible for the life cycle management of a set of +// RPC calls. The calls are started when an instance of the class is created and +// the class contract guarantees to invoke a "done" callback provided by the +// caller when all RPC calls have either completed or been cancelled. +// +// The caller should not make any assumptions about the validity of an instance +// of this class after the provided callback has been invoked, which may be +// immediately after the instance was created. +template class CallContainer { public: + typedef std::function*, int)> CreateCallFn; + typedef std::function StartCallFn; + + // Uses the provided `create_call_fn` and `start_call_fn` functions to create + // and start a set of RPC calls. When all RPC calls have either completed or + // been cancelled, the `done` callback is invoked. The caller should not make + // any assumptions about the validity of the created instance as the instance + // will delete itself after invoking the `done` callback. 
   explicit CallContainer(OpKernelContext* ctx, int num_calls, bool fail_fast,
                          bool try_rpc, AsyncOpKernel::DoneCallback done,
-                         CancellationToken token)
-      : ctx_(ctx),
-        done_(std::move(done)),
-        token_(token),
-        fail_fast_(fail_fast),
-        try_rpc_(try_rpc) {
-    CHECK_GT(num_calls, 0);
+                         CreateCallFn create_call_fn,
+                         StartCallFn start_call_fn);
 
-    // This will run when all RPCs are finished.
-    reffed_status_callback_ = new ReffedStatusCallback([this](const Status& s) {
-      ctx_->cancellation_manager()->DeregisterCallback(token_);
-      ctx_->SetStatus(s);
-      done_();
-      delete this;
-    });
+  // Registers a call with this container. This method expects its arguments
+  // to match those of a `Call` constructor as it forwards them to an
+  // underlying collection, which creates a `Call` instance in place.
+  template <typename... Args>
+  void RegisterCall(Args&&... args);
 
-    // Subtract reference count from the initial creation.
-    core::ScopedUnref unref(reffed_status_callback_);
+  // Starts the cancellation of all RPC calls managed by this container.
+  void StartCancel();
 
-    for (int i = 0; i < num_calls; ++i) {
-      // Increase the reference on the callback for each new RPC.
-      reffed_status_callback_->Ref();
-    }
-  }
-
-  std::list<Call>* calls() { return &calls_; }
-
-  void StartCancel() {
-    // Once this loop is done, can no longer assume anything is valid
-    // because "delete this" may have been immediately called.
-    // Nothing should run after this loop.
-    for (auto& call : calls_) {
-      call.StartCancel();
-    }
-  }
-
-  void Done(const Status& s, int index) {
-    if (!try_rpc_) {
-      reffed_status_callback_->UpdateStatus(s);
-    }
-    reffed_status_callback_->Unref();
-  }
+  // Indicates that the `index`-th RPC call has finished.
+  void Done(const Status& s, int index);
 
  private:
   OpKernelContext* ctx_;
@@ -81,10 +88,88 @@ class CallContainer {
   const CancellationToken token_;
   const bool fail_fast_;
   const bool try_rpc_;
+  std::shared_ptr<Notification> callback_destroyed_;
 
   // Performs its own reference counting.
   ReffedStatusCallback* reffed_status_callback_;
 };
 
+template <class Call>
+CallContainer<Call>::CallContainer(
+    OpKernelContext* ctx, int num_calls, bool fail_fast, bool try_rpc,
+    AsyncOpKernel::DoneCallback done,
+    typename CallContainer<Call>::CreateCallFn create_call_fn,
+    typename CallContainer<Call>::StartCallFn start_call_fn)
+    : ctx_(ctx),
+      done_(std::move(done)),
+      token_(ctx->cancellation_manager()->get_cancellation_token()),
+      fail_fast_(fail_fast),
+      try_rpc_(try_rpc),
+      callback_destroyed_(new Notification) {
+  CHECK_GT(num_calls, 0);
+
+  // This will run when all RPCs are finished.
+  reffed_status_callback_ = new ReffedStatusCallback([this](const Status& s) {
+    ctx_->cancellation_manager()->DeregisterCallback(token_);
+    ctx_->SetStatus(s);
+    done_();
+    callback_destroyed_->WaitForNotification();
+    delete this;
+  });
+
+  // The cancellation callback needs to be registered before the RPC calls are
+  // started to make sure that the callback is properly cleaned up by the
+  // `reffed_status_callback_` when all calls complete. At the same time, the
+  // cancellation callback should wait for the RPC calls to be started for the
+  // cancellation to take effect.
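+  //
+  // Concretely, the ordering below is: (1) register the cancellation
+  // callback, which blocks on `calls_started` so it can never cancel a call
+  // that has not been launched yet; (2) create and then start all calls;
+  // (3) notify `calls_started`, after which a pending cancellation may
+  // proceed. The `notify_when_destroyed` handle captured by the callback
+  // signals `callback_destroyed_` when the callback is destroyed, which the
+  // completion callback above waits on before running `delete this`.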
+  std::shared_ptr<internal::NotifyWhenDestroyed> notify_when_destroyed(
+      new internal::NotifyWhenDestroyed(callback_destroyed_));
+  std::shared_ptr<Notification> calls_started(new Notification);
+  bool is_cancelled = !ctx_->cancellation_manager()->RegisterCallback(
+      token_, [this, calls_started, notify_when_destroyed]() {
+        calls_started->WaitForNotification();
+        StartCancel();
+      });
+
+  for (int i = 0; i < num_calls; ++i) {
+    create_call_fn(this, i);
+    // Increase the reference on the callback for each new RPC.
+    reffed_status_callback_->Ref();
+  }
+  for (Call& call : calls_) {
+    start_call_fn(&call);
+  }
+  calls_started->Notify();
+
+  if (is_cancelled) {
+    ctx_->SetStatus(errors::Cancelled("Operation has been cancelled."));
+    StartCancel();
+  }
+
+  // Subtract reference count from the initial creation.
+  reffed_status_callback_->Unref();
+}
+
+template <class Call>
+template <typename... Args>
+void CallContainer<Call>::RegisterCall(Args&&... args) {
+  calls_.emplace_back(std::forward<Args>(args)...);
+}
+
+template <class Call>
+void CallContainer<Call>::StartCancel() {
+  for (auto& call : calls_) {
+    call.StartCancel();
+  }
+}
+
+template <class Call>
+void CallContainer<Call>::Done(const Status& s, int index) {
+  if (!try_rpc_) {
+    reffed_status_callback_->UpdateStatus(s);
+  }
+  reffed_status_callback_->Unref();
+}
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_UTIL_RPC_CALL_CONTAINER_H_
diff --git a/tensorflow/core/util/rpc/rpc_factory.h b/tensorflow/core/util/rpc/rpc_factory.h
index 9bf078c0f4a..c4eaaf44570 100644
--- a/tensorflow/core/util/rpc/rpc_factory.h
+++ b/tensorflow/core/util/rpc/rpc_factory.h
@@ -32,10 +32,11 @@ class RPCFactory {
   RPCFactory() {}
   virtual ~RPCFactory() {}
 
-  // Start a Call() to methods `method_t` at addresses `address_t` with
+  // Asynchronously invokes methods `method_t` at addresses `address_t` with
   // request strings from `request_t`. Any of these may be scalar
   // Tensors, in which case the operands are broadcasted.
-  // Upon completion of all requests, `response_t` will be populated.
+  // Upon completion of all requests, `response_t` will be populated and the
+  // `done` callback will be invoked.
   //
   // If `try_rpc` is `true`, then `status_message_t` and
   // `status_code_t` will be populated as well.

From 4355b923c273a4e07655f860a95428b2db977741 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 24 Apr 2018 13:21:49 -0700
Subject: [PATCH 0681/1734] Implement hoisting of common prefix of unary ops to concat.

PiperOrigin-RevId: 194135148
---
 tensorflow/core/grappler/op_types.cc          | 113 ++++++++---
 tensorflow/core/grappler/op_types.h           |   2 +
 .../optimizers/arithmetic_optimizer.cc        | 187 +++++++++++++++++-
 .../optimizers/arithmetic_optimizer.h         |   5 +
 .../optimizers/arithmetic_optimizer_test.cc   | 102 ++++++++++
 5 files changed, 378 insertions(+), 31 deletions(-)

diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index 9c45aed62ff..f595cf64563 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -22,6 +22,7 @@ limitations under the License.
#include "tensorflow/core/grappler/utils.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/logging.h" namespace tensorflow { namespace grappler { @@ -451,43 +452,101 @@ OPDEF_PROPERTY_HELPER(Aggregate, aggregate) OPDEF_PROPERTY_HELPER(Commutative, commutative) bool IsInvolution(const NodeDef& node) { - const std::unordered_set involution_ops{ - "Conj", "Reciprocal", "Invert", "Neg", "LogicalNot"}; - return involution_ops.count(node.op()) > 0; + static const std::unordered_set* involution_ops = + CHECK_NOTNULL((new std::unordered_set{ + "Conj", "Reciprocal", "Invert", "Neg", "LogicalNot"})); + return involution_ops->count(node.op()) > 0; } bool IsValueAndOrderPreserving(const NodeDef& node) { if (NumNonControlInputs(node) == 1 && IsAggregate(node)) { return true; } - const std::unordered_set value_and_order_preserving_ops{ - "CheckNumerics", - "DebugGradientIdentity", - "DeepCopy" - "Enter", - "Exit", - "ExpandDims", - "Identity", - "IdentityN", - "PreventGradient", - "Print", - "Reshape", - "Snapshot", - "Squeeze", - "StopGradient", - }; - return value_and_order_preserving_ops.count(node.op()) > 0; + static const std::unordered_set* value_and_order_preserving_ops = + CHECK_NOTNULL((new const std::unordered_set{ + "CheckNumerics", + "DebugGradientIdentity", + "DeepCopy" + "Enter", + "Exit", + "ExpandDims", + "Identity", + "IdentityN", + "PreventGradient", + "Print", + "Reshape", + "Snapshot", + "Squeeze", + "StopGradient", + })); + return value_and_order_preserving_ops->count(node.op()) > 0; } bool IsValuePreserving(const NodeDef& node) { - const std::unordered_set value_preserving_ops{ - "InvertPermutation", - "Reverse", - "Roll", - "Transpose", - }; + static const std::unordered_set* value_preserving_ops = + CHECK_NOTNULL((new std::unordered_set{ + "InvertPermutation", + "Reverse", + "Roll", + "Transpose", + })); return IsValueAndOrderPreserving(node) || - value_preserving_ops.count(node.op()) > 0; + value_preserving_ops->count(node.op()) > 0; +} + +bool IsUnaryElementWise(const NodeDef& node) { + static const std::unordered_set* element_wise_ops = + CHECK_NOTNULL((new std::unordered_set{ + "Abs", + "Acos", + "Acosh", + "Asin", + "Asinh", + "Atan", + "Atan2", + "Atanh", + "Ceil", + "ComplexAbs", + "Conj", + "Cos", + "Cosh", + "Digamma", + "Elu" + "Erf", + "Erfc", + "Exp", + "Expm1", + "Floor", + "Inv", + "Invert", + "Isinf", + "Isnan", + "Isfinite", + "Lgamma", + "Log", + "Log1p", + "LogicalNot", + "Neg", + "Reciprocal", + "Relu", + "Relu6", + "Rint", + "Round", + "Selu", + "Rsqrt", + "Sigmoid", + "Sign", + "Sin", + "SinH", + "Softplus", + "Softsign", + "Sqrt", + "Square", + "Tan" + "Tanh", + })); + return element_wise_ops->count(node.op()) > 0 || + (!IsIdentityN(node) && IsValueAndOrderPreserving(node)); } bool HasOpDef(const NodeDef& node) { diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h index 79fd05e1870..7f5da19d905 100644 --- a/tensorflow/core/grappler/op_types.h +++ b/tensorflow/core/grappler/op_types.h @@ -177,6 +177,8 @@ bool IsValueAndOrderPreserving(const NodeDef& node); // function returns true if the op commutes with all element-wise operations. bool IsValuePreserving(const NodeDef& node); +bool IsUnaryElementWise(const NodeDef& node); + // Returns true if we can find an opdef corresponding to the op of the node. 
 bool HasOpDef(const NodeDef& node);
 
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index ed199c1ac8b..866b993e938 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -1340,6 +1340,182 @@ class RemoveNegationStage : public ArithmeticOptimizerStage {
   }
 };
 
+// This optimization hoists the common prefix of unary ops of the inputs to
+// concat out of the concat.
+// For example: Concat([Exp(Sin(x)), Exp(Sin(y)), Exp(Sin(z))]) ->
+// Exp(Sin(Concat([x, y, z]))).
+// TODO(rmlarsen): Support casting. We would have to change the type attribute
+// on the concat node.
+class HoistCWiseUnaryFromConcatStage : public ArithmeticOptimizerStage {
+ public:
+  explicit HoistCWiseUnaryFromConcatStage(
+      const GraphOptimizerContext& ctx,
+      const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("", ctx, ctx_ext) {}
+
+  ~HoistCWiseUnaryFromConcatStage() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    if (!IsConcat(*node)) return false;
+    const int n = node->attr().at("N").i();
+    return n > 1;
+  }
+
+  Status TrySimplify(NodeDef* concat_node,
+                     string* simplified_node_name) override {
+    int prefix_length;
+    std::set<string> ctrl_inputs;
+    TF_RETURN_IF_ERROR(
+        FindCommonUnaryOpPrefix(*concat_node, &prefix_length, &ctrl_inputs));
+    if (prefix_length > 0) {
+      TF_RETURN_IF_ERROR(
+          HoistUnaryOpPrefix(prefix_length, &ctrl_inputs, concat_node));
+      AddToOptimizationQueue(concat_node);
+    }
+    return Status::OK();
+  }
+
+ private:
+  void RemoveControlInputs(std::set<string>* removed_ctrl_inputs,
+                           NodeDef* node) const {
+    const int num_inputs = node->input_size();
+    for (int idx = num_inputs - 1; idx >= 0; --idx) {
+      const string& input = node->input(idx);
+      if (IsControlInput(input)) {
+        removed_ctrl_inputs->insert(input);
+        ctx().node_map->RemoveOutput(NodeName(input), node->name());
+        node->mutable_input()->RemoveLast();
+      } else {
+        break;
+      }
+    }
+  }
+
+  void AddControlInputs(std::set<string>* new_ctrl_inputs,
+                        NodeDef* node) const {
+    for (int idx = node->input_size() - 1; idx >= 0; --idx) {
+      const string& existing_input = node->input(idx);
+      if (IsControlInput(existing_input)) {
+        new_ctrl_inputs->erase(existing_input);
+      } else {
+        break;
+      }
+    }
+    for (const string& new_input : *new_ctrl_inputs) {
+      ctx().node_map->AddOutput(NodeName(new_input), node->name());
+      node->add_input(new_input);
+    }
+  }
+
+  // Returns the length of the common unary prefix chain of ops that can be
+  // hoisted out of concat.
+  Status FindCommonUnaryOpPrefix(const NodeDef& concat_node,
+                                 int* prefix_length,
+                                 std::set<string>* ctrl_inputs) const {
+    *prefix_length = 0;
+    const int n = concat_node.attr().at("N").i();
+    // Follow the chains backwards from each concat input as long as all the
+    // following conditions hold:
+    //   1. The ops in all chains are the same.
+    //   2. The op is a unary element-wise op.
+    //   3. The op output has only a single consumer.
+    std::vector<NodeDef*> tail(n, nullptr);
+    const int start = concat_node.op() == "Concat" ? 1 : 0;
+    const int end = start + n;
+    // Set up tail pointers to point to the immediate inputs to Concat.
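+    // For example, for Concat([Exp(Sin(x)), Exp(Sin(y)), Exp(Sin(z))]) the
+    // tails start at the three Exp nodes; after one hoisted level they
+    // advance to the three Sin nodes, and the walk stops at x, y and z,
+    // yielding a prefix of length 2.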
+    for (int i = start; i < end; ++i) {
+      if (IsControlInput(concat_node.input(i))) {
+        return errors::FailedPrecondition("Got control input ",
+                                          concat_node.input(i),
+                                          " where normal input was expected.");
+      }
+      TF_RETURN_IF_ERROR(GetInputNode(concat_node.input(i), &tail[i - start]));
+    }
+
+    bool stop = false;
+    ctrl_inputs->clear();
+    while (!stop) {
+      const NodeDef* tail0 = tail[0];
+      if (!IsUnaryElementWise(*tail0)) break;
+      for (int chain = 0; chain < n; ++chain) {
+        // TODO(rmlarsen): Allow and hoist outgoing control edges.
+        if (tail[chain]->op() != tail0->op() ||
+            ctx().node_map->GetOutputs(tail[chain]->name()).size() > 1) {
+          stop = true;
+          break;
+        }
+      }
+      if (stop) break;
+      // We found one more op that can be hoisted.
+      ++(*prefix_length);
+      for (int chain = 0; chain < n; ++chain) {
+        RemoveControlInputs(ctrl_inputs, tail[chain]);
+      }
+      // Advance tail pointers to the next level.
+      for (int chain = 0; chain < n; ++chain) {
+        if (tail[chain]->input_size() == 0 ||
+            IsControlInput(tail[chain]->input(0))) {
+          stop = true;
+          break;
+        } else {
+          NodeDef* new_tail = nullptr;
+          TF_RETURN_IF_ERROR(GetInputNode(tail[chain]->input(0), &new_tail));
+          tail[chain] = new_tail;
+        }
+      }
+    }
+    return Status::OK();
+  }
+
+  Status HoistUnaryOpPrefix(const int prefix_length,
+                            std::set<string>* ctrl_inputs,
+                            NodeDef* concat_node) {
+    const int n = concat_node->attr().at("N").i();
+    const int start = concat_node->op() == "Concat" ? 1 : 0;
+    const int end = start + n;
+    const std::set<NodeDef*> consumers =
+        ctx().node_map->GetOutputs(concat_node->name());
+    AddControlInputs(ctrl_inputs, concat_node);
+    for (int chain = 0; chain < (end - start); ++chain) {
+      NodeDef* tail = nullptr;
+      const string concat_input = concat_node->input(chain + start);
+      for (int distance = 0; distance < prefix_length; ++distance) {
+        if (distance == 0) {
+          TF_RETURN_IF_ERROR(GetInputNode(concat_input, &tail));
+        } else {
+          TF_RETURN_IF_ERROR(GetInputNode(tail->input(0), &tail));
+        }
+      }
+
+      // Hook the node following tail directly into the concat node.
+      const string tail_input = tail->input(0);
+      concat_node->set_input(chain + start, tail_input);
+      ctx().node_map->UpdateInput(concat_node->name(), concat_input,
+                                  tail_input);
+
+      if (chain == 0) {
+        // Reuse nodes in the first chain to process output of concat.
+        tail->set_input(0, concat_node->name());
+        ctx().node_map->UpdateInput(tail->name(), tail_input,
+                                    concat_node->name());
+
+        // Update the consumers of concat to consume the end of the chain
+        // instead.
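+        // In the example above, the consumers of the original concat are
+        // redirected to the first chain's Exp node, whose chain now reads
+        // from the concat, yielding Exp(Sin(Concat([x, y, z]))).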
+        for (NodeDef* consumer : consumers) {
+          for (int idx = 0; idx < consumer->input_size(); ++idx) {
+            if (consumer->input(idx) == concat_node->name()) {
+              consumer->set_input(idx, concat_input);
+              ctx().node_map->UpdateInput(consumer->name(),
+                                          concat_node->name(), concat_input);
+            }
+          }
+          AddToOptimizationQueue(consumer);
+        }
+      }
+    }
+    return Status::OK();
+  }
+};
+
 }  // namespace
 
 class UniqueNodes {
@@ -1995,6 +2171,8 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
     pipeline.AddStage<RemoveRedundantCastStage>(ctx, ctx_ext);
   if (options_.remove_negation)
     pipeline.AddStage<RemoveNegationStage>(ctx, ctx_ext);
+  if (options_.hoist_unary_out_of_concat)
+    pipeline.AddStage<HoistCWiseUnaryFromConcatStage>(ctx, ctx_ext);
 
   VLOG(1) << "Run " << pipeline.NumStages()
           << " arithmetic optimizer stages: "
          << str_util::Join(pipeline.StageNames(), ", ");
@@ -2062,17 +2240,18 @@ Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
   nodes_to_preserve_ = item.NodesToPreserve();
   fetch_nodes_known_ = !item.fetch.empty();
   *optimized_graph = item.graph;
-  optimized_graph_ = optimized_graph;
+  GrapplerItem optimized_item(item, optimized_graph);
+  optimized_graph_ = &optimized_item.graph;
   node_map_.reset(new NodeMap(optimized_graph_));
 
-  DedupComputations();
+  if (options_.dedup_computations) {
+    DedupComputations();
+  }
 
   // Perform topological sort on the graph in order to help AddOpsRewrite to
   // optimize larger subgraphs starting from the roots with more inputs.
   TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph_));
 
-  GrapplerItem optimized_item(item, optimized_graph);
-  optimized_graph_ = &optimized_item.graph;
   graph_properties_.reset(new GraphProperties(optimized_item));
   const Status status = graph_properties_->InferStatically(false);
   const bool can_use_shapes = status.ok();
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index 344c8281eb1..375f13acc13 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -56,6 +56,7 @@ class ArithmeticOptimizer : public GraphOptimizer {
   struct ArithmeticOptimizerOptions {
     // TODO(ezhulenev): flag to disable TrySimplifyAndReplaceUses in tests.
     // Remove when all optimizers are migrated to separate stages.
+    bool dedup_computations = true;
     bool enable_try_simplify_and_replace = true;
     bool combine_add_to_addn = true;
     bool hoist_common_factor_out_of_aggregation = true;
@@ -64,12 +65,16 @@ class ArithmeticOptimizer : public GraphOptimizer {
     bool remove_redundant_bitcast = true;
    bool remove_redundant_cast = true;
     bool remove_negation = true;
+    bool hoist_unary_out_of_concat = false;
 
     // Choose which arithmetic optimizer stages will be enabled for a given
     // optimization level by default.
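+    // The new hoist_unary_out_of_concat stage defaults to off and is only
+    // enabled below when the optimizer runs at the AGGRESSIVE level.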
static ArithmeticOptimizerOptions Default( RewriterConfig::Toggle opt_level) { ArithmeticOptimizerOptions options; + if (opt_level == RewriterConfig::AGGRESSIVE) { + options.hoist_unary_out_of_concat = true; + } return options; } }; diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc index cb1f2ea732c..df10dbdf48f 100644 --- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc +++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc @@ -98,6 +98,7 @@ class ArithmeticOptimizerTest : public GrapplerTest { // should explicitly enable required optimization for tests isolation void DisableAllStages(ArithmeticOptimizer* optimizer) { ArithmeticOptimizer::ArithmeticOptimizerOptions options; + options.dedup_computations = false; options.enable_try_simplify_and_replace = false; options.combine_add_to_addn = false; options.hoist_common_factor_out_of_aggregation = false; @@ -147,6 +148,10 @@ class ArithmeticOptimizerTest : public GrapplerTest { DisableAllStages(optimizer); optimizer->options_.remove_negation = true; } + void EnableOnlyHoistCWiseUnaryFromConcat(ArithmeticOptimizer* optimizer) { + DisableAllStages(optimizer); + optimizer->options_.hoist_unary_out_of_concat = true; + } }; TEST_F(ArithmeticOptimizerTest, NoOp) { @@ -2086,5 +2091,102 @@ TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_BuildTreeUp) { EXPECT_EQ("mul1", mul3_node->input(1)); } +TEST_F(ArithmeticOptimizerTest, HoistCWiseUnaryFromConcat) { + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + Output a = ops::Variable(s.WithOpName("a"), {32}, DT_FLOAT); + Output b = ops::Variable(s.WithOpName("b"), {32}, DT_FLOAT); + Output c = ops::Variable(s.WithOpName("c"), {32}, DT_FLOAT); + Output axis = ops::Const(s.WithOpName("axis"), 0, {}); + Output ctrl1 = ops::Const(s.WithOpName("ctrl1"), 1, {}); + Output ctrl2 = ops::Const(s.WithOpName("ctrl2"), 2, {}); + Output ctrl3 = ops::Const(s.WithOpName("ctrl3"), 3, {}); + // Test case with chains of length 1. + Output sin_a = + ops::Sin(s.WithOpName("sin_a").WithControlDependencies(ctrl3), a); + Output exp_a = + ops::Exp(s.WithOpName("exp_a").WithControlDependencies(ctrl1), sin_a); + Output exp_b = ops::Exp(s.WithOpName("exp_b"), b); + Output exp_c = + ops::Exp(s.WithOpName("exp_c").WithControlDependencies(ctrl2), c); + Output concat = + ops::Concat(s.WithOpName("concat"), {exp_a, exp_b, exp_c}, axis); + Output id = ops::Identity(s.WithOpName("id"), concat); + + // Test case with chains of length 2. 
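+  // After the rewrite, concat2 should read {sin_a, b, c} directly while the
+  // reused first-chain nodes are stacked on top of it, i.e.
+  // id2 -> cos_exp_a2 -> exp_a2 -> concat2; the EXPECTs below verify this
+  // wiring together with the hoisted control dependencies.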
+ Output exp_a2 = + ops::Exp(s.WithOpName("exp_a2").WithControlDependencies(ctrl1), sin_a); + Output exp_b2 = ops::Exp(s.WithOpName("exp_b2"), b); + Output exp_c2 = + ops::Exp(s.WithOpName("exp_c2").WithControlDependencies(ctrl2), c); + Output cos_exp_a2 = ops::Cos( + s.WithOpName("cos_exp_a2").WithControlDependencies(ctrl1), exp_a2); + Output cos_exp_b2 = ops::Cos( + s.WithOpName("cos_exp_b2").WithControlDependencies(ctrl3), exp_b2); + Output cos_exp_c2 = ops::Cos(s.WithOpName("cos_exp_c2"), exp_c2); + Output concat2 = ops::Concat(s.WithOpName("concat2"), + {cos_exp_a2, cos_exp_b2, cos_exp_c2}, axis); + Output id2 = ops::Identity(s.WithOpName("id2"), concat2); + GrapplerItem item; + item.fetch = {"id", "id2"}; + TF_CHECK_OK(s.ToGraphDef(&item.graph)); + + GraphDef output; + ArithmeticOptimizer optimizer; + EnableOnlyHoistCWiseUnaryFromConcat(&optimizer); + + OptimizeAndPrune(&optimizer, &item, &output); + int found = 0; + for (const NodeDef& node : output.node()) { + if (node.name() == "concat") { + EXPECT_EQ(6, node.input_size()); + EXPECT_EQ("sin_a", node.input(0)); + EXPECT_EQ("b", node.input(1)); + EXPECT_EQ("c", node.input(2)); + EXPECT_EQ("axis", node.input(3)); + EXPECT_EQ("^ctrl1", node.input(4)); + EXPECT_EQ("^ctrl2", node.input(5)); + found++; + } + if (node.name() == "exp_a") { + EXPECT_EQ(1, node.input_size()); + EXPECT_EQ("concat", node.input(0)); + found++; + } + if (node.name() == "id") { + EXPECT_EQ(1, node.input_size()); + EXPECT_EQ("exp_a", node.input(0)); + found++; + } + + if (node.name() == "concat2") { + EXPECT_EQ(7, node.input_size()); + EXPECT_EQ("sin_a", node.input(0)); + EXPECT_EQ("b", node.input(1)); + EXPECT_EQ("c", node.input(2)); + EXPECT_EQ("axis", node.input(3)); + EXPECT_EQ("^ctrl1", node.input(4)); + EXPECT_EQ("^ctrl2", node.input(5)); + EXPECT_EQ("^ctrl3", node.input(6)); + found++; + } + if (node.name() == "exp_a2") { + EXPECT_EQ(1, node.input_size()); + EXPECT_EQ("concat2", node.input(0)); + found++; + } + if (node.name() == "cos_exp_a2") { + EXPECT_EQ(1, node.input_size()); + EXPECT_EQ("exp_a2", node.input(0)); + found++; + } + if (node.name() == "id2") { + EXPECT_EQ(1, node.input_size()); + EXPECT_EQ("cos_exp_a2", node.input(0)); + found++; + } + } + EXPECT_EQ(7, found); +} + } // namespace grappler } // namespace tensorflow From a3691c4af225126e14b0df1f30969899b33de243 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 24 Apr 2018 13:47:35 -0700 Subject: [PATCH 0682/1734] - Add a way to specify custom updater args to updaters in the optimizer. - Create RegAdagradOptimizer which allows the user to specify whether a gradient update is allowed to update the slot vars. 
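A minimal usage sketch (variable and loss names here are illustrative, not
part of the change):

    import tensorflow as tf
    from tensorflow.contrib.opt.python.training import reg_adagrad_optimizer

    w = tf.Variable([1.0, 2.0])
    loss = tf.reduce_sum(w * w)                  # primary training loss
    reg_loss = 0.01 * tf.reduce_sum(tf.abs(w))   # regularization term

    opt = reg_adagrad_optimizer.RegAdagradOptimizer(learning_rate=0.1)
    loss_update = opt.minimize(loss)
    # Gradients from the regularizer reuse, but do not advance, the
    # accumulator slots.
    with opt.avoid_updating_slots():
      reg_update = opt.minimize(reg_loss)
    train_op = tf.group(loss_update, reg_update)

    with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      sess.run(train_op)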
PiperOrigin-RevId: 194139121 --- tensorflow/contrib/opt/BUILD | 20 + .../python/training/reg_adagrad_optimizer.py | 107 ++++++ .../training/reg_adagrad_optimizer_test.py | 343 ++++++++++++++++++ 3 files changed, 470 insertions(+) create mode 100644 tensorflow/contrib/opt/python/training/reg_adagrad_optimizer.py create mode 100644 tensorflow/contrib/opt/python/training/reg_adagrad_optimizer_test.py diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD index 612ecc3e638..13aa1d7e7a1 100644 --- a/tensorflow/contrib/opt/BUILD +++ b/tensorflow/contrib/opt/BUILD @@ -25,6 +25,7 @@ py_library( "python/training/multitask_optimizer_wrapper.py", "python/training/nadam_optimizer.py", "python/training/powersign.py", + "python/training/reg_adagrad_optimizer.py", "python/training/sign_decay.py", "python/training/variable_clipping_optimizer.py", ], @@ -155,6 +156,25 @@ py_test( ], ) +py_test( + name = "reg_adagrad_optimizer_test", + srcs = ["python/training/reg_adagrad_optimizer_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":opt_py", + "//tensorflow/python:client_testlib", + "//tensorflow/python:constant_op", + "//tensorflow/python:dtypes", + "//tensorflow/python:embedding_ops", + "//tensorflow/python:framework_ops", + "//tensorflow/python:math_ops", + "//tensorflow/python:resource_variable_ops", + "//tensorflow/python:variable_scope", + "//tensorflow/python:variables", + "//third_party/py/numpy", + ], +) + py_test( name = "nadam_optimizer_test", srcs = ["python/training/nadam_optimizer_test.py"], diff --git a/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer.py b/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer.py new file mode 100644 index 00000000000..d0e0405a2c3 --- /dev/null +++ b/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer.py @@ -0,0 +1,107 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""RegAdagrad for TensorFlow.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.ops import math_ops +from tensorflow.python.training import adagrad +from tensorflow.python.training import training_ops +from tensorflow.python.util import tf_contextlib + + +class RegAdagradOptimizer(adagrad.AdagradOptimizer): + """RegAdagrad: Adagrad with updates that optionally skip updating the slots. + + This is meant to address the problem of additional regularization terms in the + loss function affecting learning rate decay and causing hyper-param + entanglement. Example usage: + + loss = tf.nn.cross_entropy(x, labels) + reg_loss = reg_strength * tf.reduce_sum(x * x) + opt = tf.contrib.opt.RegAdagradOptimizer(learning_rate) + loss_update = opt.minimize(loss) + with opt.avoid_updating_slots(): + reg_update = opt.minimize(reg_loss) + total_update = tf.group([loss_update, reg_update]) + + # ... + + sess.run(total_update, ...) 
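+
+  In this pattern the accumulator slots are advanced only by the gradients of
+  `loss`; the `reg_loss` update reads the existing accumulators without
+  modifying them, so the learning rate decay is driven by the primary loss
+  alone.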
+ """ + + def __init__(self, + learning_rate, + initial_accumulator_value=0.1, + use_locking=False, + name="RegAdagrad"): + super(RegAdagradOptimizer, self).__init__( + learning_rate, + initial_accumulator_value=initial_accumulator_value, + use_locking=use_locking, + name=name) + self._should_update_slots = True + + @tf_contextlib.contextmanager + def avoid_updating_slots(self): + old = self._should_update_slots + self._should_update_slots = False + try: + yield + finally: + self._should_update_slots = old + + def _apply_dense(self, grad, var): + acc = self.get_slot(var, "accumulator") + return training_ops.apply_adagrad( + var, + acc, + math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype), + grad, + use_locking=self._use_locking, + update_slots=self._should_update_slots) + + def _resource_apply_dense(self, grad, var, update_slots=True): + acc = self.get_slot(var, "accumulator") + return training_ops.resource_apply_adagrad( + var.handle, + acc.handle, + math_ops.cast(self._learning_rate_tensor, grad.dtype.base_dtype), + grad, + use_locking=self._use_locking, + update_slots=self._should_update_slots) + + def _apply_sparse(self, grad, var, update_slots=True): + acc = self.get_slot(var, "accumulator") + return training_ops.sparse_apply_adagrad( + var, + acc, + math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype), + grad.values, + grad.indices, + use_locking=self._use_locking, + update_slots=self._should_update_slots) + + def _resource_apply_sparse(self, grad, var, indices, update_slots=True): + acc = self.get_slot(var, "accumulator") + return training_ops.resource_sparse_apply_adagrad( + var.handle, + acc.handle, + math_ops.cast(self._learning_rate_tensor, grad.dtype), + grad, + indices, + use_locking=self._use_locking, + update_slots=self._should_update_slots) diff --git a/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer_test.py b/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer_test.py new file mode 100644 index 00000000000..ea56e1646a0 --- /dev/null +++ b/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer_test.py @@ -0,0 +1,343 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Functional tests for Regreg_adagrad_optimizer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.opt.python.training import reg_adagrad_optimizer +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import embedding_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables +from tensorflow.python.platform import test + + +class RegAdagradOptimizerTest(test.TestCase): + + def doTestBasic(self, use_locking=False, use_resource=False): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + if use_resource: + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) + var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype) + else: + var0 = variables.Variable([1.0, 2.0], dtype=dtype) + var1 = variables.Variable([3.0, 4.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.01], dtype=dtype) + ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer( + 3.0, initial_accumulator_value=0.1, use_locking=use_locking) + ada_update = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + # Run 3 steps of adagrad + for _ in range(3): + ada_update.run() + # Validate updated params + self.assertAllCloseAccordingToType( + np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([2.715679168701172, 3.715679168701172]), var1.eval()) + + def testBasic(self): + self.doTestBasic(use_locking=False) + + def testBasicResource(self): + self.doTestBasic(use_locking=False, use_resource=True) + + def testBasicLocked(self): + self.doTestBasic(use_locking=True) + + def testMinimizeSparseResourceVariable(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + var0 = resource_variable_ops.ResourceVariable( + [[1.0, 2.0], [3.0, 4.0]], dtype=dtype) + x = constant_op.constant([[4.0], [5.0]], dtype=dtype) + pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x) + loss = pred * pred + sgd_op = reg_adagrad_optimizer.RegAdagradOptimizer(1.0).minimize(loss) + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllCloseAccordingToType([[1.0, 2.0], [3.0, 4.0]], + var0.eval()) + # Run 1 step of sgd + sgd_op.run() + # Validate updated params + self.assertAllCloseAccordingToType( + [[0, 1], [3, 4]], var0.eval(), atol=0.01) + + def testTensorLearningRate(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + var0 = variables.Variable([1.0, 2.0], dtype=dtype) + var1 = variables.Variable([3.0, 4.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.01], dtype=dtype) + ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer( + constant_op.constant(3.0), initial_accumulator_value=0.1) + ada_update = 
ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + # Run 3 steps of adagrad + for _ in range(3): + ada_update.run() + # Validate updated params + self.assertAllCloseAccordingToType( + np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([2.715679168701172, 3.715679168701172]), var1.eval()) + + def testSparseBasic(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + var0 = variables.Variable([[1.0], [2.0]], dtype=dtype) + var1 = variables.Variable([[3.0], [4.0]], dtype=dtype) + grads0 = ops.IndexedSlices( + constant_op.constant([0.1], shape=[1, 1], dtype=dtype), + constant_op.constant([0]), constant_op.constant([2, 1])) + grads1 = ops.IndexedSlices( + constant_op.constant([0.01], shape=[1, 1], dtype=dtype), + constant_op.constant([1]), constant_op.constant([2, 1])) + ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer( + 3.0, initial_accumulator_value=0.1) + ada_update = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([[1.0], [2.0]], var0.eval()) + self.assertAllClose([[3.0], [4.0]], var1.eval()) + # Run 3 step of sgd + for _ in range(3): + ada_update.run() + # Validate updated params + self.assertAllCloseAccordingToType( + np.array([[-1.6026098728179932], [2.0]]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([[3.0], [3.715679168701172]]), var1.eval()) + + def testSparseRepeatedIndices(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + repeated_index_update_var = variables.Variable( + [[1.0], [2.0]], dtype=dtype) + aggregated_update_var = variables.Variable([[1.0], [2.0]], dtype=dtype) + grad_repeated_index = ops.IndexedSlices( + constant_op.constant([0.1, 0.1], shape=[2, 1], dtype=dtype), + constant_op.constant([1, 1]), constant_op.constant([2, 1])) + grad_aggregated = ops.IndexedSlices( + constant_op.constant([0.2], shape=[1, 1], dtype=dtype), + constant_op.constant([1]), constant_op.constant([2, 1])) + repeated_update = reg_adagrad_optimizer.RegAdagradOptimizer( + 3.0).apply_gradients([(grad_repeated_index, + repeated_index_update_var)]) + aggregated_update = reg_adagrad_optimizer.RegAdagradOptimizer( + 3.0).apply_gradients([(grad_aggregated, aggregated_update_var)]) + variables.global_variables_initializer().run() + self.assertAllClose(aggregated_update_var.eval(), + repeated_index_update_var.eval()) + for _ in range(3): + repeated_update.run() + aggregated_update.run() + self.assertAllClose(aggregated_update_var.eval(), + repeated_index_update_var.eval()) + + def testSparseRepeatedIndicesResourceVariable(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + var_repeated = resource_variable_ops.ResourceVariable( + [1.0, 2.0], dtype=dtype) + loss_repeated = math_ops.reduce_sum( + embedding_ops.embedding_lookup(var_repeated, [0, 0])) + var_aggregated = resource_variable_ops.ResourceVariable( + [1.0, 2.0], dtype=dtype) + loss_aggregated = 2 * math_ops.reduce_sum( + embedding_ops.embedding_lookup(var_aggregated, [0])) + update_op_repeated = reg_adagrad_optimizer.RegAdagradOptimizer( + 2.0).minimize(loss_repeated) + update_op_aggregated = 
reg_adagrad_optimizer.RegAdagradOptimizer( + 2.0).minimize(loss_aggregated) + variables.global_variables_initializer().run() + self.assertAllCloseAccordingToType(var_repeated.eval(), + var_aggregated.eval()) + for _ in range(3): + update_op_repeated.run() + update_op_aggregated.run() + self.assertAllCloseAccordingToType(var_repeated.eval(), + var_aggregated.eval()) + + def testSparseStability(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + shape = [1, 6] + var0 = variables.Variable( + [[ + 0.00872496, -0.106952, 0.110467, 0.226505, -0.0147257, + -0.0105945 + ]], + dtype=dtype) + grads0 = ops.IndexedSlices( + constant_op.constant( + [[ + -5.91278e-05, 5.31673e-05, -2.5779e-06, 4.29153e-05, + -8.4877e-05, -9.48906e-05 + ]], + shape=shape, + dtype=dtype), constant_op.constant([0]), + constant_op.constant(shape)) + ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer( + 1.0, initial_accumulator_value=0.1) + ada_update = ada_opt.apply_gradients(zip([grads0], [var0])) + self.assertEqual(["accumulator"], ada_opt.get_slot_names()) + slot0 = ada_opt.get_slot(var0, "accumulator") + init = variables.global_variables_initializer() + for _ in range(100): + init.run() + ada_update.run() + self.assertAllCloseAccordingToType( + np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]), slot0.eval()) + self.assertAllCloseAccordingToType( + np.array([[ + 0.00891194, -0.10712013, 0.11047515, 0.22636929, -0.0144573, + -0.01029443 + ]]), var0.eval()) + + def testSharing(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + var0 = variables.Variable([1.0, 2.0], dtype=dtype) + var1 = variables.Variable([3.0, 4.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.01], dtype=dtype) + ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer(3.0) + # Apply the optimizer twice. Both applications will use + # the same accums. + ada_update1 = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + ada_update2 = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + self.assertEqual(["accumulator"], ada_opt.get_slot_names()) + slot0 = ada_opt.get_slot(var0, "accumulator") + self.assertEquals(slot0.get_shape(), var0.get_shape()) + slot1 = ada_opt.get_slot(var1, "accumulator") + self.assertEquals(slot1.get_shape(), var1.get_shape()) + variables.global_variables_initializer().run() + + # Fetch params to validate initial values. + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + # Mix the first and the second adagrad for 3 steps. + ada_update1.run() + ada_update2.run() + ada_update1.run() + # Validate updated params (the same as with only 1 RegAdagrad). + self.assertAllCloseAccordingToType( + np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([2.715679168701172, 3.715679168701172]), var1.eval()) + + def testDynamicShapeVariable_Ok(self): + with self.test_session(): + v = variable_scope.get_variable( + "v", initializer=constant_op.constant(1.), validate_shape=False) + self.assertFalse(v.shape.is_fully_defined()) + # Creating optimizer should cause no exception. 
+ reg_adagrad_optimizer.RegAdagradOptimizer( + 3.0, initial_accumulator_value=0.1) + + def testSkipUpdatingSlots(self): + iav = 0.130005 # A value that works with float16 + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + var0 = variables.Variable([1.0, 2.0], dtype=dtype) + var1 = variables.Variable([3.0, 4.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.01], dtype=dtype) + ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer( + 3.0, initial_accumulator_value=iav) + # Apply the optimizer twice. Both applications will use + # the same accums. + with ada_opt.avoid_updating_slots(): + ada_update = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + self.assertEqual(["accumulator"], ada_opt.get_slot_names()) + slot0 = ada_opt.get_slot(var0, "accumulator") + self.assertEquals(slot0.get_shape(), var0.get_shape()) + slot1 = ada_opt.get_slot(var1, "accumulator") + self.assertEquals(slot1.get_shape(), var1.get_shape()) + variables.global_variables_initializer().run() + + # Fetch params to validate initial values. + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + # Mix the first and the second adagrad for 3 steps. + for _ in range(3): + ada_update.run() + # Validate that ada_opt's slots are not updated. + self.assertAllCloseAccordingToType(np.array([iav, iav]), slot0.eval()) + self.assertAllCloseAccordingToType(np.array([iav, iav]), slot1.eval()) + + def testSparseSkipUpdatingSlots(self): + iav = 0.130005 # A value that works with float16 + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + var0 = variables.Variable([[1.0], [2.0]], dtype=dtype) + var1 = variables.Variable([[3.0], [4.0]], dtype=dtype) + grads0 = ops.IndexedSlices( + constant_op.constant([0.1], shape=[1, 1], dtype=dtype), + constant_op.constant([0]), constant_op.constant([2, 1])) + grads1 = ops.IndexedSlices( + constant_op.constant([0.01], shape=[1, 1], dtype=dtype), + constant_op.constant([1]), constant_op.constant([2, 1])) + ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer( + 3.0, initial_accumulator_value=iav) + with ada_opt.avoid_updating_slots(): + ada_update = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + slot0 = ada_opt.get_slot(var0, "accumulator") + self.assertEquals(slot0.get_shape(), var0.get_shape()) + slot1 = ada_opt.get_slot(var1, "accumulator") + self.assertEquals(slot1.get_shape(), var1.get_shape()) + + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([[1.0], [2.0]], var0.eval()) + self.assertAllClose([[3.0], [4.0]], var1.eval()) + # Run 3 step of sgd + for _ in range(3): + ada_update.run() + # Validate that ada_opt's slots are not updated. 
+ self.assertAllCloseAccordingToType( + np.array([[iav], [iav]]), slot0.eval()) + self.assertAllCloseAccordingToType( + np.array([[iav], [iav]]), slot1.eval()) + + +if __name__ == "__main__": + test.main() From dd9ee4a2f13c2219ebd7c6f8754b8dd32188e2a5 Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Tue, 24 Apr 2018 10:59:10 -0700 Subject: [PATCH 0683/1734] Update README.md --- tensorflow/tools/docker/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/tools/docker/README.md b/tensorflow/tools/docker/README.md index f46c56e11aa..525f2995cee 100644 --- a/tensorflow/tools/docker/README.md +++ b/tensorflow/tools/docker/README.md @@ -16,12 +16,12 @@ quick links here: We currently maintain two Docker container images: -* `gcr.io/tensorflow/tensorflow` - TensorFlow with all dependencies - CPU only! +* `tensorflow/tensorflow` - TensorFlow with all dependencies - CPU only! -* `gcr.io/tensorflow/tensorflow:latest-gpu` - TensorFlow with all dependencies +* `tensorflow/tensorflow:latest-gpu` - TensorFlow with all dependencies and support for NVidia CUDA -Note: We also publish the same containers into +Note: We store all our containers on [Docker Hub](https://hub.docker.com/r/tensorflow/tensorflow/tags/). @@ -29,12 +29,12 @@ Note: We also publish the same containers into Run non-GPU container using - $ docker run -it -p 8888:8888 gcr.io/tensorflow/tensorflow + $ docker run -it -p 8888:8888 tensorflow/tensorflow For GPU support install NVidia drivers (ideally latest) and [nvidia-docker](https://github.com/NVIDIA/nvidia-docker). Run using - $ nvidia-docker run -it -p 8888:8888 gcr.io/tensorflow/tensorflow:latest-gpu + $ nvidia-docker run -it -p 8888:8888 tensorflow/tensorflow:latest-gpu Note: If you would have a problem running nvidia-docker you may try the old method @@ -44,7 +44,7 @@ it there and try using nvidia-docker as described above. $ # The old, not recommended way to run docker with gpu support: $ export CUDA_SO=$(\ls /usr/lib/x86_64-linux-gnu/libcuda.* | xargs -I{} echo '-v {}:{}') $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') - $ docker run -it -p 8888:8888 $CUDA_SO $DEVICES gcr.io/tensorflow/tensorflow:latest-gpu + $ docker run -it -p 8888:8888 $CUDA_SO $DEVICES tensorflow/tensorflow:latest-gpu ## More containers From e36ebcc88f0831c9fc16d0f5b060d076af8c0849 Mon Sep 17 00:00:00 2001 From: Yifei Feng Date: Tue, 24 Apr 2018 13:58:37 -0700 Subject: [PATCH 0684/1734] Revert #18251 due to the following issue: - calling convolution with args instead of kwargs from convolutionXd breaks when called within arg_scope. - intentional use cases trigger the added dimension error. PiperOrigin-RevId: 194140820 --- .../contrib/layers/python/layers/layers.py | 142 +----------------- .../layers/python/layers/layers_test.py | 15 +- 2 files changed, 7 insertions(+), 150 deletions(-) diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py index 2f3e57653c5..25c3b1e7ea0 100644 --- a/tensorflow/contrib/layers/python/layers/layers.py +++ b/tensorflow/contrib/layers/python/layers/layers.py @@ -932,8 +932,7 @@ def convolution(inputs, variables_collections=None, outputs_collections=None, trainable=True, - scope=None, - conv_dims=None): + scope=None): """Adds an N-D convolution followed by an optional batch_norm layer. It is required that 1 <= N <= 3. 
@@ -994,10 +993,6 @@ def convolution(inputs, trainable: If `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). scope: Optional scope for `variable_scope`. - conv_dims: Optional convolution dimensionality, when set it would use the - corresponding convolution (e.g. 2 for Conv 2D, 3 for Conv 3D, ..). When - leaved to None it would select the convolution dimensionality based on - the input rank (i.e. Conv ND, with N = input_rank - 2). Returns: A tensor representing the output of the operation. @@ -1020,9 +1015,6 @@ def convolution(inputs, inputs = ops.convert_to_tensor(inputs) input_rank = inputs.get_shape().ndims - if conv_dims is not None and conv_dims + 2 != input_rank: - raise ValueError('Convolution expects input with rank %d, got %d' % - (conv_dims + 2, input_rank)) if input_rank == 3: layer_class = convolutional_layers.Convolution1D elif input_rank == 4: @@ -1069,134 +1061,10 @@ def convolution(inputs, outputs = activation_fn(outputs) return utils.collect_named_outputs(outputs_collections, sc.name, outputs) -@add_arg_scope -def convolution1d(inputs, - num_outputs, - kernel_size, - stride=1, - padding='SAME', - data_format=None, - rate=1, - activation_fn=nn.relu, - normalizer_fn=None, - normalizer_params=None, - weights_initializer=initializers.xavier_initializer(), - weights_regularizer=None, - biases_initializer=init_ops.zeros_initializer(), - biases_regularizer=None, - reuse=None, - variables_collections=None, - outputs_collections=None, - trainable=True, - scope=None): - return convolution(inputs, - num_outputs, - kernel_size, - stride, - padding, - data_format, - rate, - activation_fn, - normalizer_fn, - normalizer_params, - weights_initializer, - weights_regularizer, - biases_initializer, - biases_regularizer, - reuse, - variables_collections, - outputs_collections, - trainable, - scope, - conv_dims=1) -convolution1d.__doc__ = convolution.__doc__ +convolution2d = convolution +convolution3d = convolution -@add_arg_scope -def convolution2d(inputs, - num_outputs, - kernel_size, - stride=1, - padding='SAME', - data_format=None, - rate=1, - activation_fn=nn.relu, - normalizer_fn=None, - normalizer_params=None, - weights_initializer=initializers.xavier_initializer(), - weights_regularizer=None, - biases_initializer=init_ops.zeros_initializer(), - biases_regularizer=None, - reuse=None, - variables_collections=None, - outputs_collections=None, - trainable=True, - scope=None): - return convolution(inputs, - num_outputs, - kernel_size, - stride, - padding, - data_format, - rate, - activation_fn, - normalizer_fn, - normalizer_params, - weights_initializer, - weights_regularizer, - biases_initializer, - biases_regularizer, - reuse, - variables_collections, - outputs_collections, - trainable, - scope, - conv_dims=2) - -convolution2d.__doc__ = convolution.__doc__ - -@add_arg_scope -def convolution3d(inputs, - num_outputs, - kernel_size, - stride=1, - padding='SAME', - data_format=None, - rate=1, - activation_fn=nn.relu, - normalizer_fn=None, - normalizer_params=None, - weights_initializer=initializers.xavier_initializer(), - weights_regularizer=None, - biases_initializer=init_ops.zeros_initializer(), - biases_regularizer=None, - reuse=None, - variables_collections=None, - outputs_collections=None, - trainable=True, - scope=None): - return convolution(inputs, - num_outputs, - kernel_size, - stride, - padding, - data_format, - rate, - activation_fn, - normalizer_fn, - normalizer_params, - weights_initializer, - weights_regularizer, - 
biases_initializer, - biases_regularizer, - reuse, - variables_collections, - outputs_collections, - trainable, - scope, - conv_dims=3) - -convolution3d.__doc__ = convolution.__doc__ @add_arg_scope def convolution2d_in_plane( @@ -1543,7 +1411,7 @@ def dense_to_sparse(tensor, eos_token=0, outputs_collections=None, scope=None): Args: tensor: An `int` `Tensor` to be converted to a `Sparse`. eos_token: An integer. - It is part of the target label that signifies the end of a sentence. + It is part of the target label that signfies the end of a sentence. outputs_collections: Collection to add the outputs. scope: Optional scope for name_scope. """ @@ -1687,7 +1555,7 @@ def _inner_flatten(inputs, new_rank, output_collections=None, scope=None): output_collections: Collection to which the outputs will be added. scope: Optional scope for `name_scope`. Returns: - A `Tensor` or `SparseTensor` containing the same values as `inputs`, but + A `Tensor` or `SparseTensor` conataining the same values as `inputs`, but with innermost dimensions flattened to obtain rank `new_rank`. Raises: diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py index b01fd5d5c95..997f910a2a9 100644 --- a/tensorflow/contrib/layers/python/layers/layers_test.py +++ b/tensorflow/contrib/layers/python/layers/layers_test.py @@ -310,17 +310,6 @@ class BiasAddTest(test.TestCase): class ConvolutionTest(test.TestCase): - def testInvalidShape(self): - with self.test_session(): - images_2d = random_ops.random_uniform((5, 7, 9, 3), seed=1) - with self.assertRaisesRegexp( - ValueError, 'Convolution expects input with rank 5, got 4'): - layers_lib.convolution3d(images_2d, 32, 3) - images_3d = random_ops.random_uniform((5, 6, 7, 9, 3), seed=1) - with self.assertRaisesRegexp( - ValueError, 'Convolution expects input with rank 4, got 5'): - layers_lib.convolution2d(images_3d, 32, 3) - def testInvalidDataFormat(self): height, width = 7, 9 with self.test_session(): @@ -3166,7 +3155,7 @@ class RepeatTests(test.TestCase): with self.test_session(): images = np.random.uniform(size=(5, height, width, 3)).astype(np.float32) output = _layers.repeat(images, 3, layers_lib.conv2d, 32, [3, 3]) - self.assertEqual(output.op.name, 'Repeat/convolution2d_3/Relu') + self.assertEqual(output.op.name, 'Repeat/convolution_3/Relu') self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 32]) def testRepeatWithScope(self): @@ -3760,7 +3749,7 @@ class StackTests(test.TestCase): layers_lib.convolution2d, [10, 20, 30], kernel_size=[3, 3], padding='SAME') - self.assertEqual(output.op.name, 'Stack/convolution2d_3/Relu') + self.assertEqual(output.op.name, 'Stack/convolution_3/Relu') self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 30]) def testStackWithScope(self): From b7bf05ade772a21bc9b74aa290a4493955ff2a1f Mon Sep 17 00:00:00 2001 From: ctiijima Date: Tue, 24 Apr 2018 14:17:14 -0700 Subject: [PATCH 0685/1734] typo fixes --- tensorflow/docs_src/get_started/index.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/docs_src/get_started/index.md b/tensorflow/docs_src/get_started/index.md index b28cb9df75d..578080bb592 100644 --- a/tensorflow/docs_src/get_started/index.md +++ b/tensorflow/docs_src/get_started/index.md @@ -10,13 +10,13 @@ course prior to diving into TensorFlow documentation: TensorFlow is a tool for machine learning. While it contains a wide range of functionality, TensorFlow is mainly designed for deep neural network models. 
-The easiest way to get started with tensorflow is using Eager Execution. +The easiest way to get started with TensorFlow is by using Eager Execution. - * @{$get_started/eager}, is for anyone new to machine learning or TensorFlow. + * @{$get_started/eager}, is for anyone new to machine learning or TensorFlow. TensorFlow provides many APIs. The remainder of this section focuses on the Estimator API which provide scalable, high-performance models. -To get started with Estimators begin by reading one of the following documents: +To get started with Estimators, begin by reading one of the following documents: * @{$get_started/get_started_for_beginners}, which is aimed at readers new to machine learning. From 7d1fe156d79cad6818a443d3e9473dd6abd4ab56 Mon Sep 17 00:00:00 2001 From: Alexandre Passos Date: Tue, 24 Apr 2018 14:26:21 -0700 Subject: [PATCH 0686/1734] shape_tuple in array_ops.stack PiperOrigin-RevId: 194145557 --- tensorflow/python/ops/array_ops.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index ceeabe090df..aba8beb3f4d 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -935,9 +935,9 @@ def stack(values, axis=0, name="stack"): except (TypeError, ValueError): pass # Input list contains non-constant tensors - value_shape = ops.convert_to_tensor(values[0], name=name).get_shape() - if value_shape.ndims is not None: - expanded_num_dims = value_shape.ndims + 1 + value_shape = ops.convert_to_tensor(values[0], name=name)._shape_tuple() # pylint: disable=protected-access + if value_shape is not None: + expanded_num_dims = len(value_shape) + 1 if axis < -expanded_num_dims or axis >= expanded_num_dims: raise ValueError("axis = %d not in [%d, %d)" % (axis, -expanded_num_dims, expanded_num_dims)) From 1c9493f1b6aa56653b018ecf25af7040317fbb1b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 24 Apr 2018 14:32:39 -0700 Subject: [PATCH 0687/1734] Run shape inference directly on the graphdef instead of building an intermediate graph. PiperOrigin-RevId: 194146713 --- tensorflow/core/grappler/costs/BUILD | 2 + .../core/grappler/costs/graph_properties.cc | 552 +++++++++--------- .../core/grappler/costs/graph_properties.h | 26 +- .../grappler/costs/graph_properties_test.cc | 6 + tensorflow/core/grappler/graph_view.cc | 49 ++ tensorflow/core/grappler/graph_view.h | 36 +- 6 files changed, 372 insertions(+), 299 deletions(-) diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD index ddbf7f3697d..35f11eac295 100644 --- a/tensorflow/core/grappler/costs/BUILD +++ b/tensorflow/core/grappler/costs/BUILD @@ -42,6 +42,8 @@ cc_library( deps = [ ":utils", "//tensorflow/core/grappler/utils:topological_sort", + "//tensorflow/core/grappler:graph_view", + "//tensorflow/core/grappler:op_types", "//tensorflow/core:core_cpu_base", "//tensorflow/core:framework", "//tensorflow/core:lib", diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc index ca30ad83a0c..e3c6c403063 100644 --- a/tensorflow/core/grappler/costs/graph_properties.cc +++ b/tensorflow/core/grappler/costs/graph_properties.cc @@ -19,10 +19,13 @@ limitations under the License. 
#include #include #include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/framework/versions.pb.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/grappler/costs/utils.h" +#include "tensorflow/core/grappler/graph_view.h" +#include "tensorflow/core/grappler/op_types.h" #include "tensorflow/core/grappler/utils.h" #include "tensorflow/core/grappler/utils/topological_sort.h" #include "tensorflow/core/lib/strings/str_util.h" @@ -253,16 +256,16 @@ typename DisjointSet::Rep* DisjointSet::Find(Handle value) { return root; } -bool IsQueue(const Node& node) { - return str_util::EndsWith(node.type_string(), "QueueV2"); +bool IsQueue(const NodeDef& node) { + return str_util::EndsWith(node.op(), "QueueV2"); } // Returns true if the node is an Enter op AND its input is a Queue. -bool IsEnterWithQueue(const Node& node) { - if (node.IsEnter()) { - const Node* in_node; - TF_CHECK_OK(node.input_node(0, &in_node)); - return IsQueue(*in_node); +bool IsEnterWithQueue(const NodeDef& node, const GraphView& graph) { + if (IsEnter(node)) { + GraphView::InputPort input(&node, 0); + GraphView::OutputPort fanin = graph.GetRegularFanin(input); + return IsQueue(*fanin.node); } return false; } @@ -279,8 +282,9 @@ bool HasAnyUnknownDimensions(const TensorShapeProto& proto) { return false; } +// This really should be done in an external debugging tool void VerboseLogUnknownDimensionSources( - const Graph& graph, + const GraphDef& graph, const std::map>& input_properties_map, const std::map>& @@ -295,17 +299,13 @@ void VerboseLogUnknownDimensionSources( // do not have any unknown dimensions in their inputs, but // we have some unknown dimensions in their outputs. 
std::map op_to_count; - for (const Node* const node : graph.nodes()) { - if (node->num_outputs() == 0) { - continue; - } - - const auto& input_properties = input_properties_map.at(node->name()); - const auto& output_properties = output_properties_map.at(node->name()); + for (const NodeDef& node : graph.node()) { + const auto& input_properties = input_properties_map.at(node.name()); + const auto& output_properties = output_properties_map.at(node.name()); bool has_unknown_inputs = false; - for (int i = 0; i < node->num_inputs(); ++i) { - if (HasAnyUnknownDimensions(input_properties[i].shape())) { + for (const auto& input_prop : input_properties) { + if (HasAnyUnknownDimensions(input_prop.shape())) { has_unknown_inputs = true; break; } @@ -315,26 +315,24 @@ void VerboseLogUnknownDimensionSources( continue; } - for (int i = 0; i < node->num_outputs(); ++i) { - if (HasAnyUnknownDimensions(output_properties[i].shape())) { + for (const auto& output_prop : output_properties) { + if (HasAnyUnknownDimensions(output_prop.shape())) { string inputs = "input_shapes=["; - for (int i = 0; i < node->num_inputs(); ++i) { - inputs += - PartialTensorShape::DebugString(input_properties[i].shape()); + for (const auto& input_prop : input_properties) { + inputs += PartialTensorShape::DebugString(input_prop.shape()); } inputs += "]"; string outputs = "output_shapes=["; - for (int i = 0; i < node->num_outputs(); ++i) { - outputs += - PartialTensorShape::DebugString(output_properties[i].shape()); + for (const auto& output_prop : output_properties) { + outputs += PartialTensorShape::DebugString(output_prop.shape()); } outputs += "]"; - VLOG(2) << "Node: " << node->name() << ", Op: " << node->def().op() - << ", " << inputs << ", " << outputs; + VLOG(2) << "Node: " << node.name() << ", Op: " << node.op() << ", " + << inputs << ", " << outputs; - op_to_count[node->def().op()]++; + op_to_count[node.op()]++; // don't log again for this node break; @@ -357,13 +355,13 @@ void VerboseLogUnknownDimensionSources( // information is refined. class TopoQueue { public: - explicit TopoQueue(const std::unordered_map& topo_order) + explicit TopoQueue(const std::unordered_map& topo_order) : queue_(CompareNodes(topo_order)) {} - void push(const Node* n) { queue_.insert(n); } - const Node* pop() { + void push(const NodeDef* n) { queue_.insert(n); } + const NodeDef* pop() { CHECK(!empty()); auto it = queue_.begin(); - const Node* n = *it; + const NodeDef* n = *it; queue_.erase(it); return n; } @@ -376,16 +374,16 @@ class TopoQueue { // use their id to ensure they're sorted topologically. struct CompareNodes { explicit CompareNodes( - const std::unordered_map& topo_ordering) + const std::unordered_map& topo_ordering) : topo_order(topo_ordering) {} - bool operator()(const Node* lhs, const Node* rhs) const { + bool operator()(const NodeDef* lhs, const NodeDef* rhs) const { return topo_order.at(lhs) < topo_order.at(rhs); } private: - const std::unordered_map& topo_order; + const std::unordered_map& topo_order; }; - std::set queue_; + std::set queue_; }; // Merge and relax symbolic shapes. 
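The TopoQueue above is the heart of the propagation loop: nodes get re-enqueued whenever an input shape is refined, and popping them in topological order bounds the rework. A minimal standalone sketch of the same idea, assuming the caller has already computed a topological rank for every node (DemoNode and TopoOrderQueue are illustrative names, not TensorFlow's; after this patch the real class keys its map by NodeDef* instead of Node*):

    // A std::set of (rank, node) pairs keeps pending work sorted by
    // topological rank and deduplicates nodes pushed more than once.
    #include <cassert>
    #include <set>
    #include <string>
    #include <unordered_map>
    #include <utility>

    struct DemoNode {
      std::string name;
    };

    class TopoOrderQueue {
     public:
      explicit TopoOrderQueue(
          const std::unordered_map<const DemoNode*, int>& topo_order)
          : topo_order_(topo_order) {}

      void Push(const DemoNode* n) { queue_.insert({topo_order_.at(n), n}); }

      const DemoNode* Pop() {
        assert(!queue_.empty());
        auto it = queue_.begin();
        const DemoNode* n = it->second;
        queue_.erase(it);
        return n;
      }

      bool Empty() const { return queue_.empty(); }

     private:
      const std::unordered_map<const DemoNode*, int>& topo_order_;
      std::set<std::pair<int, const DemoNode*>> queue_;
    };

    int main() {
      DemoNode a{"a"}, b{"b"}, c{"c"};
      std::unordered_map<const DemoNode*, int> order{{&a, 0}, {&b, 1}, {&c, 2}};
      TopoOrderQueue q(order);
      q.Push(&c);
      q.Push(&a);
      q.Push(&c);             // duplicate push collapses into one entry
      assert(q.Pop() == &a);  // pops in topological order, not push order
      assert(q.Pop() == &c);
      assert(q.Empty());
    }

Ordering by a precomputed rank rather than a FIFO is what lets the refiner visit each node after all of its already-refined inputs, even when fanouts are pushed repeatedly.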
@@ -396,22 +394,41 @@ class TopoQueue { class SymbolicShapeRefiner { public: explicit SymbolicShapeRefiner( - const GraphDef& graph, + const GraphView& graph, const std::unordered_map>& fed_ports) - : function_library_(OpRegistry::Global(), graph.library()), + : graph_(graph), + function_library_(OpRegistry::Global(), graph.GetGraph()->library()), fed_ports_(fed_ports) { - graph_def_version_ = graph.versions().producer(); - node_to_context_.reserve(graph.node_size()); + graph_def_version_ = graph.GetGraph()->versions().producer(); + node_to_context_.reserve(graph.GetGraph()->node_size()); } - InferenceContext* GetContext(const Node* node) { + const GraphView& graph() const { return graph_; } + + struct NodeContext { + const OpRegistrationData* op_data; + DataTypeVector input_types; + DataTypeVector output_types; + std::unique_ptr inference_context; + std::vector output_tensors_as_shapes; + }; + + NodeContext* GetNodeContext(const NodeDef* node) { + auto it = node_to_context_.find(node); + if (it == node_to_context_.end()) { + return nullptr; + } + return &it->second; + } + + InferenceContext* GetContext(const NodeDef* node) { auto it = node_to_context_.find(node); if (it == node_to_context_.end()) { return nullptr; } return it->second.inference_context.get(); } - Status UpdateNode(const Node* node, bool relax, bool* refined) { + Status UpdateNode(const NodeDef* node, bool relax, bool* refined) { NodeContext* node_context = GetNodeContext(node); if (node_context == nullptr) { TF_RETURN_IF_ERROR(AddNode(node)); @@ -421,82 +438,84 @@ class SymbolicShapeRefiner { // Check if the shapes of the nodes in the fan-in of this node have changed, // and if they have, update the node input shapes. InferenceContext* inference_context = node_context->inference_context.get(); - std::vector const_values(node->num_inputs()); - std::vector input_tensors(node->num_inputs(), nullptr); - std::vector input_tensors_as_shapes(node->num_inputs()); + std::vector const_values(inference_context->num_inputs()); + std::vector input_tensors(inference_context->num_inputs(), + nullptr); + std::vector input_tensors_as_shapes( + inference_context->num_inputs()); - for (const Edge* e : node->in_edges()) { - if (e->IsControlEdge()) continue; + for (int dst_input = 0; dst_input < inference_context->num_inputs(); + ++dst_input) { + GraphView::InputPort port(node, dst_input); + for (const GraphView::OutputPort fanin : graph_.GetFanin(port)) { + int src_output = fanin.port_id; + const NodeDef* input = fanin.node; + NodeContext* c = GetNodeContext(input); + if (c == nullptr) { + return errors::FailedPrecondition( + "Input ", dst_input, " ('", input->name(), "') for '", + node->name(), "' was not previously added to ShapeRefiner."); + } - int dst_input = e->dst_input(); - int src_output = e->src_output(); - - Node* input = e->src(); - NodeContext* c = GetNodeContext(input); - if (c == nullptr) { - return errors::FailedPrecondition( - "Input ", dst_input, " ('", input->name(), "') for '", node->name(), - "' was not previously added to ShapeRefiner."); - } - - if (input->IsConstant()) { - // Convert constant value into tensors. - if (const_values[dst_input].FromProto( - input->def().attr().at("value").tensor())) { - input_tensors[dst_input] = &const_values[dst_input]; - // Integer tensors of rank one can also be interpreted as a shape - // provided all their values are >= -1. 
- if (const_values[dst_input].dims() == 1 && - (const_values[dst_input].dtype() == DT_INT32 || - const_values[dst_input].dtype() == DT_INT64)) { - ShapeHandle tensor_shape = inference_context->Vector( - const_values[dst_input].NumElements()); - ShapeHandle shp; - if (inference_context - ->MakeShapeFromTensor(input_tensors[dst_input], - tensor_shape, &shp) - .ok()) { - input_tensors_as_shapes[dst_input] = shp; + if (IsConstant(*input)) { + // Convert constant value into tensors. + if (const_values[dst_input].FromProto( + input->attr().at("value").tensor())) { + input_tensors[dst_input] = &const_values[dst_input]; + // Integer tensors of rank one can also be interpreted as a shape + // provided all their values are >= -1. + if (const_values[dst_input].dims() == 1 && + (const_values[dst_input].dtype() == DT_INT32 || + const_values[dst_input].dtype() == DT_INT64)) { + ShapeHandle tensor_shape = inference_context->Vector( + const_values[dst_input].NumElements()); + ShapeHandle shp; + if (inference_context + ->MakeShapeFromTensor(input_tensors[dst_input], + tensor_shape, &shp) + .ok()) { + input_tensors_as_shapes[dst_input] = shp; + } } } } - } - if (c->output_tensors_as_shapes.size() > src_output) { - input_tensors_as_shapes[dst_input] = - c->output_tensors_as_shapes[src_output]; - } + if (c->output_tensors_as_shapes.size() > src_output) { + input_tensors_as_shapes[dst_input] = + c->output_tensors_as_shapes[src_output]; + } - DCHECK_GE(dst_input, 0); - if (!*refined && !inference_context->input(dst_input).SameHandle( - c->inference_context->output(src_output))) { - *refined = true; - } - inference_context->SetInput(dst_input, - c->inference_context->output(src_output)); - - if (!*refined && - inference_context->requested_input_tensor_as_partial_shape( - dst_input)) { - // The input value may have changed. Since we have no way to know if - // that's indeed the case, err on the safe side. - *refined = true; - } - - // Also propagate handle shape and dtype of edges which are carrying - // resource handles. - if (e->src()->output_type(src_output) == DT_RESOURCE) { - auto* outputs = - c->inference_context->output_handle_shapes_and_types(src_output); - if (!outputs) continue; - auto* inputs = - inference_context->input_handle_shapes_and_types(dst_input); - - if (!inputs || !EquivalentShapesAndTypes(*outputs, *inputs)) { + DCHECK_GE(dst_input, 0); + if (!*refined && !inference_context->input(dst_input).SameHandle( + c->inference_context->output(src_output))) { *refined = true; } - inference_context->set_input_handle_shapes_and_types(dst_input, - *outputs); + inference_context->SetInput(dst_input, + c->inference_context->output(src_output)); + + if (!*refined && + inference_context->requested_input_tensor_as_partial_shape( + dst_input)) { + // The input value may have changed. Since we have no way to know if + // that's indeed the case, err on the safe side. + *refined = true; + } + + // Also propagate handle shape and dtype of edges which are carrying + // resource handles. 
+ if (node_context->input_types[dst_input] == DT_RESOURCE) { + auto* outputs = + c->inference_context->output_handle_shapes_and_types(src_output); + if (!outputs) continue; + auto* inputs = + inference_context->input_handle_shapes_and_types(dst_input); + + if (!inputs || !EquivalentShapesAndTypes(*outputs, *inputs)) { + *refined = true; + } + inference_context->set_input_handle_shapes_and_types(dst_input, + *outputs); + } } } @@ -510,10 +529,10 @@ class SymbolicShapeRefiner { input_tensors_as_shapes); // Update the shapes of the outputs. - return InferShapes(node, node_context); + return InferShapes(*node, node_context); } - Status SetUnknownShape(const Node* node, int output_port) { + Status SetUnknownShape(const NodeDef* node, int output_port) { shape_inference::ShapeHandle shape = GetUnknownOutputShape(node, output_port); InferenceContext* ctx = GetContext(node); @@ -525,7 +544,7 @@ class SymbolicShapeRefiner { } struct ShapeId { - const Node* node; + const NodeDef* node; int port_id; bool operator==(const ShapeId& other) const { return node == other.node && port_id == other.port_id; @@ -533,12 +552,12 @@ class SymbolicShapeRefiner { }; struct HashShapeId { std::size_t operator()(const ShapeId& shp) const { - return std::hash{}(shp.node) + shp.port_id; + return std::hash{}(shp.node) + shp.port_id; } }; struct DimId { - const Node* node; + const NodeDef* node; int port_id; int dim_index; bool operator==(const DimId& other) const { @@ -549,13 +568,14 @@ class SymbolicShapeRefiner { struct HashDimId { std::size_t operator()(const DimId& dim) const { - return std::hash{}(dim.node) + dim.port_id + dim.dim_index; + return std::hash{}(dim.node) + dim.port_id + + dim.dim_index; } }; // Compute the shape of the tensors outputed by node 'node' at output port // 'port_index' as the intersection of shape1 and shape2. - ShapeHandle OutputAsIntersection(const Node* node, int port_index, + ShapeHandle OutputAsIntersection(const NodeDef* node, int port_index, ShapeHandle shape1, ShapeHandle shape2) { if (shape1.SameHandle(shape2)) { return shape1; @@ -600,7 +620,7 @@ class SymbolicShapeRefiner { // Compute the shape of the tensors outputed by node 'node' at output port // 'port_index' as the union of shape1 and shape2. - ShapeHandle OutputAsUnion(const Node* node, int port_index, + ShapeHandle OutputAsUnion(const NodeDef* node, int port_index, ShapeHandle shape1, ShapeHandle shape2) { if (shape1.SameHandle(shape2)) { return shape1; @@ -670,20 +690,24 @@ class SymbolicShapeRefiner { return true; } - Status AddNode(const Node* node) { + Status AddNode(const NodeDef* node) { + NodeContext& node_ctx = node_to_context_[node]; + TF_RETURN_IF_ERROR(function_library_.LookUp(node->op(), &node_ctx.op_data)); + + TF_RETURN_IF_ERROR(InOutTypesForNode(*node, node_ctx.op_data->op_def, + &node_ctx.input_types, + &node_ctx.output_types)); + // Create the inference context for this node. 
- std::vector input_shapes(node->num_inputs()); + const int num_inputs = node_ctx.input_types.size(); + std::vector input_shapes(num_inputs); std::vector>> - input_handle_shapes_and_types(node->num_inputs()); - std::vector input_tensors(node->num_inputs(), nullptr); + input_handle_shapes_and_types(num_inputs); + std::vector input_tensors(num_inputs, nullptr); std::vector input_tensors_as_shapes; - NodeContext& node_ctx = node_to_context_[node]; - TF_RETURN_IF_ERROR( - function_library_.LookUp(node->type_string(), &node_ctx.op_data)); - node_ctx.inference_context.reset(new InferenceContext( - graph_def_version_, &node->def(), node->op_def(), input_shapes, + graph_def_version_, node, node_ctx.op_data->op_def, input_shapes, input_tensors, input_tensors_as_shapes, std::move(input_handle_shapes_and_types))); const Status s = node_ctx.inference_context->construction_status(); @@ -696,7 +720,7 @@ class SymbolicShapeRefiner { private: // Return the one ShapeHandle used to denote a fully unknown shape for a node // output. - ShapeHandle GetUnknownOutputShape(const Node* node, int index) { + ShapeHandle GetUnknownOutputShape(const NodeDef* node, int index) { ShapeId id{node, index}; auto it = unknown_shapes_.find(id); if (it != unknown_shapes_.end()) { @@ -709,7 +733,8 @@ class SymbolicShapeRefiner { } // Return the one ShapeHandle used to denote a fully unknown dimension for a // node output. - DimensionHandle GetUnknownOutputDim(const Node* node, int index, int dim_id) { + DimensionHandle GetUnknownOutputDim(const NodeDef* node, int index, + int dim_id) { DimId id{node, index, dim_id}; auto it = unknown_dims_.find(id); if (it != unknown_dims_.end()) { @@ -721,31 +746,25 @@ class SymbolicShapeRefiner { return dim; } - struct NodeContext { - const OpRegistrationData* op_data; - std::unique_ptr inference_context; - std::vector output_tensors_as_shapes; - }; - - Status InferShapes(const Node* node, NodeContext* c) { + Status InferShapes(const NodeDef& node, NodeContext* c) { InferenceContext* ic = c->inference_context.get(); - auto it = fed_ports_.find(node->name()); + auto it = fed_ports_.find(node.name()); const bool is_fed = it != fed_ports_.end(); // Propagate shape tensors unless the node is fed. // TODO(bsteiner) We should still propagate the shapes to the ports that // aren't fed in the case of a ShapeN node. if (!is_fed) { - if (node->type_string() == "Shape") { + if (IsShape(node)) { c->output_tensors_as_shapes.resize(1); c->output_tensors_as_shapes[0] = c->inference_context->input(0); - } else if (node->type_string() == "ShapeN") { + } else if (IsShapeN(node)) { c->output_tensors_as_shapes.resize(c->inference_context->num_inputs()); for (int i = 0; i < c->inference_context->num_inputs(); ++i) { c->output_tensors_as_shapes[i] = c->inference_context->input(i); } - } else if (node->type_string() == "ConcatV2") { + } else if (node.op() == "ConcatV2") { bool valid = true; ShapeHandle result; for (int i = 0; i < ic->num_inputs() - 1; ++i) { @@ -763,7 +782,7 @@ class SymbolicShapeRefiner { c->output_tensors_as_shapes.resize(1); c->output_tensors_as_shapes[0] = result; } - } else if (node->type_string() == "Slice") { + } else if (IsSlice(node)) { ShapeHandle input = ic->input_tensors_as_shapes()[0]; bool valid = ic->RankKnown(input); const Tensor* slice_offset = ic->input_tensor(1); @@ -800,22 +819,16 @@ class SymbolicShapeRefiner { // It is possible to feed node output ports with tensors of any shape: as // a result, the shape of a fed port is completely unknown. 
for (const int output_port : it->second) { - status.Update(SetUnknownShape(node, output_port)); + status.Update(SetUnknownShape(&node, output_port)); } } return status; } - NodeContext* GetNodeContext(const Node* node) { - auto it = node_to_context_.find(node); - if (it == node_to_context_.end()) { - return nullptr; - } - return &it->second; - } - + private: + const GraphView& graph_; int graph_def_version_; - std::unordered_map node_to_context_; + std::unordered_map node_to_context_; std::unordered_map unknown_shapes_; std::unordered_map unknown_dims_; FunctionLibraryDefinition function_library_; @@ -874,7 +887,7 @@ class SymbolicShapeManager { }; Status GraphProperties::MergeEnqueueShapesAndTypes( - SymbolicShapeRefiner* shape_refiner, const Node* qnode, + SymbolicShapeRefiner* shape_refiner, const NodeDef* qnode, const std::vector& shapes_and_types, std::vector* queue_shapes_and_types) { if (shapes_and_types.size() != queue_shapes_and_types->size()) { @@ -897,7 +910,7 @@ Status GraphProperties::MergeEnqueueShapesAndTypes( } Status GraphProperties::RelaxEnqueueShapesAndMergeTypes( - SymbolicShapeRefiner* shape_refiner, const Node* qnode, + SymbolicShapeRefiner* shape_refiner, const NodeDef* qnode, const std::vector& shapes_and_types, std::vector* queue_shapes_and_types) { if (shapes_and_types.size() != queue_shapes_and_types->size()) { @@ -925,7 +938,7 @@ Status GraphProperties::RelaxEnqueueShapesAndMergeTypes( // inputs are UnknownShapes. So we need to ignore the input from NextIteration // nodes to propagate any known shape from the Merge node. Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner, - const Node* node, bool relax, + const NodeDef* node, bool relax, bool* new_shapes) const { InferenceContext* c = shape_refiner->GetContext(node); if (!c) { @@ -942,25 +955,24 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner, ShapeHandle out; bool out_initialized = false; - for (const Edge* e : node->in_edges()) { - if (e->IsControlEdge()) { - continue; - } + for (const GraphView::Edge fanin : + shape_refiner->graph().GetFaninEdges(*node, false)) { // Skip back edges during the initial propagation phase. This is equivalent // to assuming that all the inputs to the merge nodes are fed by the same // shape, and will be corrected as needed in the relaxation phase. - if (!relax && e->src()->IsNextIteration()) { + if (!relax && IsNextIteration(*fanin.src.node)) { continue; } - InferenceContext* in = shape_refiner->GetContext(e->src()); + InferenceContext* in = shape_refiner->GetContext(fanin.src.node); if (!relax && !in) { // Handling a loop for the first time, the back edge won't have any shape // info. continue; } - ShapeHandle input = in->output(e->src_output()); - c->SetInput(e->dst_input(), input); + ShapeHandle input = in->output(fanin.src.port_id); + CHECK_EQ(fanin.tgt.node, node); + c->SetInput(fanin.tgt.port_id, input); if (!out_initialized) { out_initialized = true; out = input; @@ -984,7 +996,7 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner, // Manually propagate the input shape for Enter nodes and update any Merge node // outputs. 
Status GraphProperties::UpdateEnter(SymbolicShapeRefiner* shape_refiner, - const Node* node, bool relax, + const NodeDef* node, bool relax, bool* new_shapes) { auto enter_ctx = shape_refiner->GetContext(node); if (!enter_ctx) { @@ -992,33 +1004,27 @@ Status GraphProperties::UpdateEnter(SymbolicShapeRefiner* shape_refiner, enter_ctx = shape_refiner->GetContext(node); } - for (const Edge* e : node->in_edges()) { - if (e->IsControlEdge()) { - continue; - } - InferenceContext* in = shape_refiner->GetContext(e->src()); - ShapeHandle input = in->output(e->src_output()); - if (!enter_ctx->output(0).SameHandle(input)) { - if (relax) { - enter_ctx->RelaxInput(0, input); - } else { - enter_ctx->MergeInput(0, input); - } - enter_ctx->set_output(0, input); - *new_shapes = true; - } + GraphView::InputPort inp(node, 0); + GraphView::OutputPort fanin = shape_refiner->graph().GetRegularFanin(inp); + + InferenceContext* in = shape_refiner->GetContext(fanin.node); + ShapeHandle input = in->output(fanin.port_id); + if (!enter_ctx->output(0).SameHandle(input)) { + enter_ctx->SetInput(0, input); + enter_ctx->set_output(0, input); + *new_shapes = true; } return Status::OK(); } -Status GraphProperties::UpdateShapes( - SymbolicShapeRefiner* shape_refiner, bool relax, - const Node* n, bool* new_shapes) const { - if (n->IsEnter()) { +Status GraphProperties::UpdateShapes(SymbolicShapeRefiner* shape_refiner, + bool relax, const NodeDef* n, + bool* new_shapes) const { + if (IsEnter(*n)) { // The Enter shape function always forwards an UnknownShape, so do the right // thing here. TF_RETURN_IF_ERROR(UpdateEnter(shape_refiner, n, relax, new_shapes)); - } else if (n->IsMerge()) { + } else if (IsMerge(*n)) { // Properly handle merge nodes. TF_RETURN_IF_ERROR(UpdateMergeNode(shape_refiner, n, relax, new_shapes)); } else { @@ -1028,7 +1034,7 @@ Status GraphProperties::UpdateShapes( if (updated) { // We want to avoid propagating through loops on the merge pass because // the shapes are not guaranteed to converge. - if (relax || !n->IsNextIteration()) { + if (relax || !IsNextIteration(*n)) { *new_shapes = true; } } @@ -1039,8 +1045,8 @@ Status GraphProperties::UpdateShapes( // Propagates the shapes in the transitive fan-out of . Status GraphProperties::PropagateShapes( SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes, - const std::unordered_map>& - resources, + const std::unordered_map>& resources, int num_loops) const { // Limit the number of iterations to prevent infinite loops in the presence of // incorrect shape functions. 
The algoritm should converge in at most @@ -1062,15 +1068,13 @@ Status GraphProperties::PropagateShapes( int64 num_loop_iterations = 0; while (!new_shapes->empty() && num_loop_iterations++ < max_loop_iterations) { - const Node* n = new_shapes->pop(); + const NodeDef* n = new_shapes->pop(); bool updated = false; TF_RETURN_IF_ERROR(UpdateShapes(shape_refiner, relax, n, &updated)); if (updated) { - for (const Edge* e : n->out_edges()) { - if (!e->IsControlEdge()) { - const Node* fanout = e->dst(); - new_shapes->push(fanout); - } + for (const GraphView::InputPort fanout : + shape_refiner->graph().GetFanouts(*n, false)) { + new_shapes->push(fanout.node); } } } @@ -1093,10 +1097,11 @@ Status GraphProperties::PropagateShapes( } Status GraphProperties::UpdateResource( - const Node* qnode, const std::unordered_set& queue_inputs, + const NodeDef* qnode, + const std::unordered_set& queue_inputs, SymbolicShapeRefiner* shape_refiner, TopoQueue* new_shapes) { // Proceed only if qnode is a queue or an Enter with queue input. - if (!IsQueue(*qnode) && !IsEnterWithQueue(*qnode)) { + if (!IsQueue(*qnode) && !IsEnterWithQueue(*qnode, shape_refiner->graph())) { return Status::OK(); } auto qctx = shape_refiner->GetContext(qnode); @@ -1109,16 +1114,17 @@ Status GraphProperties::UpdateResource( // are in. std::vector queue_shapes_and_types; for (const auto& node : queue_inputs) { - auto ctx = shape_refiner->GetContext(node); + auto ctx = shape_refiner->GetNodeContext(node); if (!ctx) { continue; } // TODO(bsteiner): handle EnqueueMany as well. - if (node->type_string().find("Enqueue") != std::string::npos && - node->type_string().find("EnqueueMany") == std::string::npos) { + if (node->op().find("Enqueue") != std::string::npos && + node->op().find("EnqueueMany") == std::string::npos) { std::vector shapes_and_types; - for (int i = 1; i < ctx->num_inputs(); ++i) { - shapes_and_types.push_back({ctx->input(i), node->input_type(i)}); + for (int i = 1; i < ctx->input_types.size(); ++i) { + shapes_and_types.push_back( + {ctx->inference_context->input(i), ctx->input_types[i]}); } if (queue_shapes_and_types.empty()) { queue_shapes_and_types = shapes_and_types; @@ -1134,11 +1140,9 @@ Status GraphProperties::UpdateResource( queue_shapes_and_types)) { qctx->set_output_handle_shapes_and_types(0, queue_shapes_and_types); - for (const Edge* e : qnode->out_edges()) { - if (!e->IsControlEdge()) { - const Node* fanout = e->dst(); - new_shapes->push(fanout); - } + for (const GraphView::InputPort fanout : + shape_refiner->graph().GetFanouts(*qnode, false)) { + new_shapes->push(fanout.node); } } @@ -1148,18 +1152,6 @@ Status GraphProperties::UpdateResource( Status GraphProperties::InferStatically(bool assume_valid_feeds) { FunctionLibraryDefinition function_library(OpRegistry::Global(), item_.graph.library()); - Graph graph(function_library); - graph_ = &graph; - ImportGraphDefOptions options; - // Graph optimization happens at the late stage of graph execution, - // when colocation constraints are already validated previously and - // the device placement of nodes has also completed, so there - // is no need to validate colocation constraints again. 
- options.validate_colocation_constraints = false; - options.validate_shape = false; - Status s = ImportGraphDef(options, item_.graph, &graph, nullptr); - TF_RETURN_IF_ERROR(s); - std::unordered_map> fed_ports; if (!assume_valid_feeds) { for (const auto& feed : item_.feed) { @@ -1172,46 +1164,45 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) { std::unordered_map topo_order; TF_RETURN_IF_ERROR(ComputeTopologicalOrder(item_.graph, &topo_order)); - std::unordered_map order_by_name; - for (const auto topo : topo_order) { - order_by_name[topo.first->name()] = topo.second; - } + GraphView graph_view(&item_.graph); - // List the resources and the nodes using them. Also collect the Enter and - // Merge nodes. - std::unordered_map graph_topo_order; - std::unordered_map> resources; - std::unordered_set merge_nodes; - std::unordered_set fed_nodes; - std::unordered_set primary_inputs; + // List the resources and the nodes using them. Also collect the Merge nodes, + // fed nodes, and primary inputs. + std::unordered_map> + resources; + std::unordered_set merge_nodes; + std::unordered_set fed_nodes; + std::unordered_set primary_inputs; int num_loops = 0; - for (const Node* const node : graph.nodes()) { - auto it = order_by_name.find(node->name()); - if (it == order_by_name.end()) { - continue; - } - graph_topo_order[node] = it->second; - - for (int i = 0; i < node->num_inputs(); ++i) { - if (node->input_type(i) == DataType::DT_RESOURCE) { - const Node* resource; - TF_CHECK_OK(node->input_node(i, &resource)); - resources[resource].insert(node); + for (const NodeDef& node : item_.graph.node()) { + if (NumNonControlInputs(node) == 0) { + primary_inputs.insert(&node); + } else if (IsMerge(node)) { + merge_nodes.insert(&node); + } else if (IsNextIteration(node)) { + ++num_loops; + } else { + const OpRegistrationData* op_data; + TF_RETURN_IF_ERROR(function_library.LookUp(node.op(), &op_data)); + DataTypeVector input_types; + DataTypeVector output_types; + TF_RETURN_IF_ERROR(InOutTypesForNode(node, op_data->op_def, &input_types, + &output_types)); + for (int i = 0; i < input_types.size(); ++i) { + if (input_types[i] == DataType::DT_RESOURCE) { + GraphView::InputPort input(&node, i); + const GraphView::OutputPort resource = + graph_view.GetRegularFanin(input); + resources[resource.node].insert(&node); + } } } - if (node->num_inputs() == 0) { - primary_inputs.insert(node); - } else if (node->IsMerge()) { - merge_nodes.insert(node); - } else if (node->IsNextIteration()) { - ++num_loops; - } - if (fed_ports.find(node->name()) != fed_ports.end()) { - fed_nodes.insert(node); + if (fed_ports.find(node.name()) != fed_ports.end()) { + fed_nodes.insert(&node); } } - SymbolicShapeRefiner refiner(item_.graph, fed_ports); + SymbolicShapeRefiner refiner(graph_view, fed_ports); // We propagate shapes through the graph in two phases. In the first phase, we // exclusively merge shapes but we do not propagate shapes through the @@ -1219,19 +1210,19 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) { // we exclusively relax shapes and propagate shapes through loops until // reaching fixed point. for (int relax = 0; relax < 2; relax++) { - TopoQueue new_shapes(graph_topo_order); + TopoQueue new_shapes(topo_order); // Seed the propagation of shapes through merge nodes. if (relax) { - for (const Node* node : merge_nodes) { + for (const NodeDef* node : merge_nodes) { new_shapes.push(node); } } // Also seed the propagation of shapes in the fanout of primary inputs. 
- for (const Node* node : primary_inputs) { + for (const NodeDef* node : primary_inputs) { new_shapes.push(node); } // Also seed the propagation of shapes in the fanout of fed nodes. - for (const Node* node : fed_nodes) { + for (const NodeDef* node : fed_nodes) { new_shapes.push(node); } // Propagate shapes normally. @@ -1242,14 +1233,14 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) { // Track shapes globally across the graph. SymbolicShapeManager shape_manager; bool found_error = false; - for (const Node* const node : graph.nodes()) { - auto node_ctx = refiner.GetContext(node); + for (const NodeDef& node : item_.graph.node()) { + auto node_ctx = refiner.GetContext(&node); if (!node_ctx) { continue; } // Skip any information that comes from fed nodes. - if (fed_ports.find(node->name()) != fed_ports.end()) { - VLOG(2) << "Skipping feed node shape: " << node->name(); + if (fed_ports.find(node.name()) != fed_ports.end()) { + VLOG(2) << "Skipping feed node shape: " << node.name(); continue; } for (const auto& merged_shapes : node_ctx->MergedShapes()) { @@ -1273,61 +1264,56 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) { } } - for (const Node* const node : graph.nodes()) { - VLOG(3) << "Filling in graph properties for node: " << node->name(); - auto ctx = refiner.GetContext(node); + for (const NodeDef& node : item_.graph.node()) { + VLOG(3) << "Filling in graph properties for node: " << node.name(); + auto ctx = refiner.GetNodeContext(&node); if (!ctx) { continue; } // Fill input properties. { - CHECK_EQ(ctx->num_inputs(), node->num_inputs()); - auto& input_properties = input_properties_[node->name()]; + // CHECK_EQ(ctx->num_inputs(), node.num_inputs()); + auto& input_properties = input_properties_[node.name()]; // Should always be empty, node names in graph are supposed to be unique. CHECK_EQ(input_properties.size(), 0); - input_properties.resize(ctx->num_inputs()); - for (int i = 0; i < ctx->num_inputs(); ++i) { - shape_manager.AsTensorProperties(ctx->input(i), node->input_type(i), + input_properties.resize(ctx->inference_context->num_inputs()); + GraphView::InputPort input(&node, -1); + for (int i = 0; i < ctx->inference_context->num_inputs(); ++i) { + shape_manager.AsTensorProperties(ctx->inference_context->input(i), + ctx->input_types[i], &input_properties[i]); - } - for (const auto& edge : node->in_edges()) { - if (edge->IsControlEdge()) { + input.port_id = i; + GraphView::OutputPort fanin = graph_view.GetRegularFanin(input); + if (!IsConstant(*fanin.node)) { continue; } - if (!edge->src()->IsConstant()) { - continue; - } - const int input_id = edge->dst_input(); - if (input_id >= input_properties.size()) { - continue; - } - const NodeDef& node = edge->src()->def(); - const TensorProto& raw_val = node.attr().at("value").tensor(); - *input_properties[input_id].mutable_value() = raw_val; + const TensorProto& raw_val = fanin.node->attr().at("value").tensor(); + *input_properties[i].mutable_value() = raw_val; } } // Fill output properties. { - CHECK_EQ(ctx->num_outputs(), node->num_outputs()); - auto& output_properties = output_properties_[node->name()]; + // CHECK_EQ(ctx->num_outputs(), node->num_outputs()); + auto& output_properties = output_properties_[node.name()]; // Should always be empty, node names in graph are supposed to be unique. 
CHECK_EQ(output_properties.size(), 0); - output_properties.resize(ctx->num_outputs()); - for (int i = 0; i < ctx->num_outputs(); ++i) { - shape_manager.AsTensorProperties(ctx->output(i), node->output_type(i), + output_properties.resize(ctx->inference_context->num_outputs()); + for (int i = 0; i < ctx->inference_context->num_outputs(); ++i) { + shape_manager.AsTensorProperties(ctx->inference_context->output(i), + ctx->output_types[i], &output_properties[i]); } } } // Help trace the unknown dimensions to their origins. - VerboseLogUnknownDimensionSources(graph, input_properties_, + VerboseLogUnknownDimensionSources(item_.graph, input_properties_, output_properties_); return Status::OK(); diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h index a4e3031db14..485324c4664 100644 --- a/tensorflow/core/grappler/costs/graph_properties.h +++ b/tensorflow/core/grappler/costs/graph_properties.h @@ -24,7 +24,6 @@ limitations under the License. #include "tensorflow/core/grappler/grappler_item.h" namespace tensorflow { -class Graph; namespace grappler { @@ -79,40 +78,41 @@ class GraphProperties { // Merges shapes , determined from an EnqueueV2 node, into // <*queue_shapes_and_types>. static Status MergeEnqueueShapesAndTypes( - SymbolicShapeRefiner* shape_refiner, const Node* qnode, + SymbolicShapeRefiner* shape_refiner, const NodeDef* qnode, const std::vector& shapes_and_types, std::vector* queue_shapes_and_types); // Relaxes shapes , determined from an EnqueueV2 node, into // <*queue_shapes_and_types>. static Status RelaxEnqueueShapesAndMergeTypes( - SymbolicShapeRefiner* shape_refiner, const Node* qnode, + SymbolicShapeRefiner* shape_refiner, const NodeDef* qnode, const std::vector& shapes_and_types, std::vector* queue_shapes_and_types); // Update the shapes for qnode. If output shapes of qnode have changed, // enqueue its fanout in 'new_shapes'. static Status UpdateResource( - const Node* qnode, const std::unordered_set& queue_inputs, + const NodeDef* qnode, + const std::unordered_set& queue_inputs, SymbolicShapeRefiner* shape_refiner, TopoQueue* new_shapes); // Update the output shapes of a Merge node, and enqueue its fanout in // new_shapes if needed. - Status UpdateMergeNode(SymbolicShapeRefiner* shape_refiner, const Node* node, - bool relax, bool* new_shapes) const; + Status UpdateMergeNode(SymbolicShapeRefiner* shape_refiner, + const NodeDef* node, bool relax, + bool* new_shapes) const; // Process the Enter node, and enqueue its fanout in new_shapes if needed. static Status UpdateEnter(SymbolicShapeRefiner* shape_refiner, - const Node* node, bool relax, bool* new_shapes); + const NodeDef* node, bool relax, bool* new_shapes); // Update the shapes for node 'n'. If output shapes for n have changed, // enqueue its fanout in 'new_shapes'. - Status UpdateShapes( - SymbolicShapeRefiner* shape_refiner, bool relax, - const Node* n, bool* new_shapes) const; + Status UpdateShapes(SymbolicShapeRefiner* shape_refiner, bool relax, + const NodeDef* n, bool* new_shapes) const; // Propagate the shapes for the nodes enqueued in new_shapes and their // transitive fanout until a fixed point is reached. 
Status PropagateShapes( SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes, - const std::unordered_map>& - resources, + const std::unordered_map>& resources, int num_loops) const; // Data members @@ -120,8 +120,6 @@ class GraphProperties { std::map> input_properties_; std::map> output_properties_; const std::vector missing_properties_; - - Graph* graph_; }; } // end namespace grappler diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc index 3de697bd372..afe334dfa2f 100644 --- a/tensorflow/core/grappler/costs/graph_properties_test.cc +++ b/tensorflow/core/grappler/costs/graph_properties_test.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/cc/framework/scope.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/framework/graph_def_util.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/framework/tensor_testutil.h" @@ -955,6 +956,11 @@ TEST_F(GraphPropertiesTest, Performance) { string filename = io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataPath, "large_graph.pbtxt.html"); TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph)); + TF_CHECK_OK(AddDefaultAttrsToGraphDef( + &item.graph, + FunctionLibraryDefinition(OpRegistry::Global(), item.graph.library()), 0, + true)); + GraphProperties properties(item); TF_CHECK_OK(properties.InferStatically(false)); } diff --git a/tensorflow/core/grappler/graph_view.cc b/tensorflow/core/grappler/graph_view.cc index 0d3f94854b6..3e448216f90 100644 --- a/tensorflow/core/grappler/graph_view.cc +++ b/tensorflow/core/grappler/graph_view.cc @@ -173,5 +173,54 @@ int GraphView::NumFanins(const NodeDef& node, return count; } +std::unordered_set +GraphView::GetFanoutEdges(const NodeDef& node, + bool include_controlled_edges) const { + std::unordered_set result; + OutputPort port; + port.node = const_cast(&node); + const int first_port_id = include_controlled_edges ? -1 : 0; + auto it = num_regular_outputs_.find(&node); + const int last_port_id = (it != num_regular_outputs_.end()) ? 
it->second : -1; + + for (int i = first_port_id; i <= last_port_id; ++i) { + port.port_id = i; + auto it = fanouts_.find(port); + if (it != fanouts_.end()) { + Edge fanout; + fanout.src.node = const_cast(&node); + fanout.src.port_id = i; + for (auto itr = it->second.begin(); itr != it->second.end(); ++itr) { + fanout.tgt = *itr; + result.insert(fanout); + } + } + } + return result; +} + +std::unordered_set +GraphView::GetFaninEdges(const NodeDef& node, + bool include_controlling_edges) const { + std::unordered_set result; + for (int i = 0; i < node.input_size(); ++i) { + Edge fanin; + fanin.tgt.node = const_cast(&node); + fanin.tgt.port_id = i; + string fanin_name = ParseNodeName(node.input(i), &fanin.src.port_id); + if (fanin.src.port_id < 0) { + if (!include_controlling_edges) { + break; + } + } + auto it = nodes_.find(fanin_name); + if (it != nodes_.end()) { + fanin.src.node = it->second; + result.insert(fanin); + } + } + return result; +} + } // end namespace grappler } // end namespace tensorflow diff --git a/tensorflow/core/grappler/graph_view.h b/tensorflow/core/grappler/graph_view.h index 173ce9c09c2..c3baad09878 100644 --- a/tensorflow/core/grappler/graph_view.h +++ b/tensorflow/core/grappler/graph_view.h @@ -29,6 +29,8 @@ namespace grappler { class GraphView { public: struct Port { + Port() : node(nullptr), port_id(-1) {} + Port(NodeDef* n, int port) : node(n), port_id(port) {} NodeDef* node = nullptr; int port_id = -1; @@ -36,8 +38,16 @@ class GraphView { return node == other.node && port_id == other.port_id; } }; - struct InputPort : public Port {}; - struct OutputPort : public Port {}; + struct InputPort : public Port { + InputPort() = default; + InputPort(NodeDef* n, int port_id) : Port(n, port_id) {} + InputPort(const NodeDef* n, int port_id) + : Port(const_cast(n), port_id) {} + }; + struct OutputPort : public Port { + OutputPort() = default; + OutputPort(NodeDef* n, int port_id) : Port(n, port_id) {} + }; struct HashPort { std::size_t operator()(const Port& port) const { @@ -45,6 +55,20 @@ class GraphView { } }; + struct Edge { + OutputPort src; + InputPort tgt; + + bool operator==(const Edge& other) const { + return src == other.src && tgt == other.tgt; + } + }; + struct HashEdge { + std::size_t operator()(const Edge& edge) const { + return HashPort()(edge.src) + HashPort()(edge.tgt); + } + }; + explicit GraphView(GraphDef* graph); GraphDef* GetGraph() const { return graph_; } NodeDef* GetNode(const string& node_name) const; @@ -63,6 +87,7 @@ class GraphView { const OutputPort& port) const; std::unordered_set GetFanin( const InputPort& port) const; + // Special case: regular (i.e. non-control) input ports can only have one // fanin. const OutputPort GetRegularFanin(const InputPort& port) const; @@ -79,6 +104,13 @@ class GraphView { // controlling nodes iff include_controlling_nodes is true. int NumFanins(const NodeDef& node, bool include_controlling_nodes) const; + // Get all the edge in the immediate fanout (resp fanin) of a node. Include + // the control edges iff include_controlling_edges is true. 
+ std::unordered_set GetFanoutEdges( + const NodeDef& node, bool include_controlled_edges) const; + std::unordered_set GetFaninEdges( + const NodeDef& node, bool include_controlling_edges) const; + private: GraphDef* graph_; std::unordered_map nodes_; From 3624fe7d063f8fa6fe5bd864ced291f520c54cdd Mon Sep 17 00:00:00 2001 From: Ruoxin Sang Date: Tue, 24 Apr 2018 14:42:07 -0700 Subject: [PATCH 0688/1734] Invalidate the StatCache as well as the FileBlockCache, as once the file is overwritten or removed, the stat will become outdated. PiperOrigin-RevId: 194148397 --- .../core/platform/cloud/expiring_lru_cache.h | 18 +++++++ .../platform/cloud/expiring_lru_cache_test.cc | 17 +++++++ .../core/platform/cloud/gcs_file_system.cc | 19 ++++--- .../core/platform/cloud/gcs_file_system.h | 3 ++ .../platform/cloud/gcs_file_system_test.cc | 50 +++++++++++++++++++ 5 files changed, 100 insertions(+), 7 deletions(-) diff --git a/tensorflow/core/platform/cloud/expiring_lru_cache.h b/tensorflow/core/platform/cloud/expiring_lru_cache.h index c738497ddd5..e2d048f141c 100644 --- a/tensorflow/core/platform/cloud/expiring_lru_cache.h +++ b/tensorflow/core/platform/cloud/expiring_lru_cache.h @@ -51,6 +51,14 @@ class ExpiringLRUCache { InsertLocked(key, value); } + // Delete the entry with key `key`. Return true if the entry was found for + // `key`, false if the entry was not found. In both cases, there is no entry + // with key `key` existed after the call. + bool Delete(const string& key) { + mutex_lock lock(mu_); + return DeleteLocked(key); + } + /// Look up the entry with key `key` and copy it to `value` if found. Returns /// true if an entry was found for `key`, and its timestamp is not more than /// max_age_ seconds in the past. @@ -141,6 +149,16 @@ class ExpiringLRUCache { } } + bool DeleteLocked(const string& key) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + auto it = cache_.find(key); + if (it == cache_.end()) { + return false; + } + lru_list_.erase(it->second.lru_iterator); + cache_.erase(it); + return true; + } + /// The maximum age of entries in the cache, in seconds. A value of 0 means /// that no entry is ever placed in the cache. const uint64 max_age_; diff --git a/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc b/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc index 3bc6db38429..42879e80a9e 100644 --- a/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc +++ b/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc @@ -174,5 +174,22 @@ TEST(ExpiringLRUCacheTest, Clear) { EXPECT_FALSE(cache.Lookup("d", &value)); } +TEST(ExpiringLRUCacheTest, Delete) { + // Insert an entry. + ExpiringLRUCache cache(1, 4); + cache.Insert("a", 1); + int value = 0; + EXPECT_TRUE(cache.Lookup("a", &value)); + EXPECT_EQ(value, 1); + + // Delete the entry. + EXPECT_TRUE(cache.Delete("a")); + EXPECT_FALSE(cache.Lookup("a", &value)); + + // Try deleting the entry again. 
+ EXPECT_FALSE(cache.Delete("a")); + EXPECT_FALSE(cache.Lookup("a", &value)); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc index f0003fa7849..2d9c99c124a 100644 --- a/tensorflow/core/platform/cloud/gcs_file_system.cc +++ b/tensorflow/core/platform/cloud/gcs_file_system.cc @@ -857,14 +857,20 @@ Status GcsFileSystem::LoadBufferFromGCS(const string& filename, size_t offset, return Status::OK(); } +void GcsFileSystem::ClearFileCaches(const string& fname) { + file_block_cache_->RemoveFile(fname); + stat_cache_->Delete(fname); + // TODO(rxsang): Remove the patterns that matche the file in + // MatchingPathsCache as well. +} + Status GcsFileSystem::NewWritableFile(const string& fname, std::unique_ptr* result) { string bucket, object; TF_RETURN_IF_ERROR(ParseGcsPath(fname, false, &bucket, &object)); - result->reset(new GcsWritableFile( - bucket, object, this, &timeouts_, - [this, fname]() { file_block_cache_->RemoveFile(fname); }, - initial_retry_delay_usec_)); + result->reset(new GcsWritableFile(bucket, object, this, &timeouts_, + [this, fname]() { ClearFileCaches(fname); }, + initial_retry_delay_usec_)); return Status::OK(); } @@ -904,8 +910,7 @@ Status GcsFileSystem::NewAppendableFile(const string& fname, TF_RETURN_IF_ERROR(ParseGcsPath(fname, false, &bucket, &object)); result->reset(new GcsWritableFile( bucket, object, this, old_content_filename, &timeouts_, - [this, fname]() { file_block_cache_->RemoveFile(fname); }, - initial_retry_delay_usec_)); + [this, fname]() { ClearFileCaches(fname); }, initial_retry_delay_usec_)); return Status::OK(); } @@ -1277,7 +1282,7 @@ Status GcsFileSystem::DeleteFile(const string& fname) { request->SetDeleteRequest(); TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when deleting ", fname); - file_block_cache_->RemoveFile(fname); + ClearFileCaches(fname); return Status::OK(); } diff --git a/tensorflow/core/platform/cloud/gcs_file_system.h b/tensorflow/core/platform/cloud/gcs_file_system.h index 703c8d57784..99c94c17515 100644 --- a/tensorflow/core/platform/cloud/gcs_file_system.h +++ b/tensorflow/core/platform/cloud/gcs_file_system.h @@ -227,6 +227,9 @@ class GcsFileSystem : public FileSystem { Status LoadBufferFromGCS(const string& filename, size_t offset, size_t n, char* buffer, size_t* bytes_transferred); + // Clear all the caches related to the file with name `filename`. 
+ void ClearFileCaches(const string& fname); + std::unique_ptr auth_provider_; std::unique_ptr http_request_factory_; std::unique_ptr file_block_cache_; diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc index ca4b7722b62..c6392999543 100644 --- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc +++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc @@ -1551,6 +1551,56 @@ TEST(GcsFileSystemTest, DeleteFile_NoObjectName) { fs.DeleteFile("gs://bucket/").code()); } +TEST(GcsFileSystemTest, DeleteFile_StatCacheRemoved) { + std::vector requests( + {new FakeHttpRequest( + "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/" + "file.txt?fields=size%2Cupdated\n" + "Auth Token: fake_token\n" + "Timeouts: 5 1 10\n", + strings::StrCat("{\"size\": \"1010\"," + "\"updated\": \"2016-04-29T23:15:24.896Z\"}")), + new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b" + "/bucket/o/file.txt\n" + "Auth Token: fake_token\n" + "Timeouts: 5 1 10\n" + "Delete: yes\n", + ""), + new FakeHttpRequest( + "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/" + "file.txt?fields=size%2Cupdated\n" + "Auth Token: fake_token\n" + "Timeouts: 5 1 10\n", + "", errors::NotFound("404"), 404), + new FakeHttpRequest( + "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?" + "fields=items%2Fname%2CnextPageToken&prefix=file.txt%2F" + "&maxResults=1\n" + "Auth Token: fake_token\n" + "Timeouts: 5 1 10\n", + "{}")}); + GcsFileSystem fs( + std::unique_ptr(new FakeAuthProvider), + std::unique_ptr( + new FakeHttpRequestFactory(&requests)), + 16 /* block size */, 16 /* max bytes */, 0 /* max staleness */, + 3600 /* stat cache max age */, 0 /* stat cache max entries */, + 0 /* matching paths cache max age */, + 0 /* matching paths cache max entries */, 0 /* initial retry delay*/, + kTestTimeoutConfig, nullptr /* gcs additional header */); + + // Stats the file first so the stat is cached. + FileStatistics stat_before_deletion; + TF_EXPECT_OK(fs.Stat("gs://bucket/file.txt", &stat_before_deletion)); + EXPECT_EQ(1010, stat_before_deletion.length); + + TF_EXPECT_OK(fs.DeleteFile("gs://bucket/file.txt")); + + FileStatistics stat_after_deletion; + EXPECT_EQ(error::Code::NOT_FOUND, + fs.Stat("gs://bucket/file.txt", &stat_after_deletion).code()); +} + TEST(GcsFileSystemTest, DeleteDir_Empty) { std::vector requests({new FakeHttpRequest( "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?" From 61c463020618ef6441392db770bdb0ec23375c73 Mon Sep 17 00:00:00 2001 From: Nick Felt Date: Tue, 24 Apr 2018 14:51:20 -0700 Subject: [PATCH 0689/1734] Update tensorboard dep to 1.8.x --- tensorflow/tools/pip_package/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 6da3223d339..bcf6c1e5158 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -38,7 +38,7 @@ REQUIRED_PACKAGES = [ 'numpy >= 1.13.3', 'six >= 1.10.0', 'protobuf >= 3.4.0', - 'tensorboard >= 1.7.0, < 1.8.0', + 'tensorboard >= 1.8.0, < 1.9.0', 'termcolor >= 1.1.0', ] From 03005b129691bf6db8cf8c8c5a82be70ac79571c Mon Sep 17 00:00:00 2001 From: Billy Lamberta Date: Tue, 24 Apr 2018 14:52:38 -0700 Subject: [PATCH 0690/1734] docs: install_linux, move GPU section below install procedures. 
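Before the documentation-only diff that follows, the GCS change two commits above is worth restating: once a file is overwritten or deleted, dropping only the block cache leaves Stat() serving stale metadata, so the cleanup callback now clears both caches. A much-simplified, single-threaded sketch of that pattern (TinyFs, FileStat, and all names here are hypothetical, with plain maps standing in for the TTL/LRU caches):

    #include <cassert>
    #include <string>
    #include <unordered_map>

    struct FileStat {
      long long length = 0;
    };

    class TinyFs {
     public:
      bool LookupStat(const std::string& name, FileStat* out) const {
        auto it = stat_cache_.find(name);
        if (it == stat_cache_.end()) return false;
        *out = it->second;
        return true;
      }
      void CacheStat(const std::string& name, FileStat s) {
        stat_cache_[name] = s;
      }

      void DeleteFile(const std::string& name) {
        // ... issue the DELETE request ...
        ClearFileCaches(name);  // the pre-fix code dropped only block_cache_
      }

     private:
      // Mirrors the fix: invalidate *both* caches, not just the block cache.
      void ClearFileCaches(const std::string& name) {
        block_cache_.erase(name);
        stat_cache_.erase(name);
      }

      std::unordered_map<std::string, std::string> block_cache_;
      std::unordered_map<std::string, FileStat> stat_cache_;
    };

    int main() {
      TinyFs fs;
      fs.CacheStat("gs://bucket/file.txt", FileStat{1010});
      FileStat s;
      assert(fs.LookupStat("gs://bucket/file.txt", &s) && s.length == 1010);

      fs.DeleteFile("gs://bucket/file.txt");
      assert(!fs.LookupStat("gs://bucket/file.txt", &s));  // no stale stat
    }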
--- tensorflow/docs_src/install/install_linux.md | 198 +++++++++---------- 1 file changed, 98 insertions(+), 100 deletions(-) diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md index fa82ac9c40a..c66d50c3cb1 100644 --- a/tensorflow/docs_src/install/install_linux.md +++ b/tensorflow/docs_src/install/install_linux.md @@ -1,106 +1,25 @@ # Installing TensorFlow on Ubuntu -This guide explains how to install TensorFlow on Ubuntu. Although these -instructions might also work on other Linux variants, we have only -tested (and we only support) these instructions on machines meeting the -following requirements: +This guide explains how to install TensorFlow on Ubuntu Linux. While these +instructions may work on other Linux variants, they are tested and supported with +the following system requirements: - * 64-bit desktops or laptops - * Ubuntu 16.04 or higher +* 64-bit desktops or laptops +* Ubuntu 16.04 or higher -## Determine which TensorFlow to install +## Choose which TensorFlow to install -You must choose one of the following types of TensorFlow to install: +The following TensorFlow variants are available for installation: - * **TensorFlow with CPU support only**. If your system does not have a - NVIDIA® GPU, you must install this version. Note that this version of - TensorFlow is typically much easier to install (typically, - in 5 or 10 minutes), so even if you have an NVIDIA GPU, we recommend - installing this version first. - * **TensorFlow with GPU support**. TensorFlow programs typically run - significantly faster on a GPU than on a CPU. Therefore, if your - system has a NVIDIA® GPU meeting the prerequisites shown below and you - need to run performance-critical applications, you should ultimately - install this version. - - -### NVIDIA requirements to run TensorFlow with GPU support - -If you are installing TensorFlow with GPU support using one of the -mechanisms described in this guide, then the following NVIDIA software -must be installed on your system: - - * [CUDA Toolkit 9.0](http://nvidia.com/cuda). For details, see - [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/). - Ensure that you append the relevant CUDA pathnames to the - `LD_LIBRARY_PATH` environment variable as described in the - NVIDIA documentation. - * [cuDNN SDK v7](http://developer.nvidia.com/cudnn). For details, see - [NVIDIA's documentation](http://docs.nvidia.com/deeplearning/sdk/cudnn-install/). - Ensure that you create the `CUDA_HOME` environment variable as - described in the NVIDIA documentation. - * GPU card with CUDA Compute Capability 3.0 or higher for building - from source and 3.5 or higher for our binaries. See - [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for - a list of supported GPU cards. - * [GPU drivers](http://nvidia.com/driver) supporting your version of the CUDA - Toolkit. - * The libcupti-dev library, which is the NVIDIA CUDA Profile Tools Interface. - This library provides advanced profiling support. To install this library, - issue the following command for CUDA Toolkit >= 8.0: - -
-    $ sudo apt-get install cuda-command-line-tools
-    
- - and add its path to your `LD_LIBRARY_PATH` environment variable: - -
-    $ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/cuda/extras/CUPTI/lib64
-    
- - For CUDA Toolkit <= 7.5 do: - -
-    $ sudo apt-get install libcupti-dev
-    
- - * **[OPTIONAL]** For optimized inferencing performance, you can also install - **NVIDIA TensorRT 3.0**. The minimal set of TensorRT runtime components needed - for use with the pre-built `tensorflow-gpu` package can be installed as follows: - -
-    $ wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1404/x86_64/nvinfer-runtime-trt-repo-ubuntu1404-3.0.4-ga-cuda9.0_1.0-1_amd64.deb
-    $ sudo dpkg -i nvinfer-runtime-trt-repo-ubuntu1404-3.0.4-ga-cuda9.0_1.0-1_amd64.deb
-    $ sudo apt-get update
-    $ sudo apt-get install -y --allow-downgrades libnvinfer-dev libcudnn7-dev=7.0.5.15-1+cuda9.0 libcudnn7=7.0.5.15-1+cuda9.0
-    
- - **IMPORTANT:** For compatibility with the pre-built `tensorflow-gpu` - package, please use the Ubuntu **14.04** package of TensorRT as shown above, - even when installing onto an Ubuntu 16.04 system.
-
- To build the TensorFlow-TensorRT integration module from source rather than - using pre-built binaries, see the [module documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/tensorrt#using-tensorrt-in-tensorflow). - For detailed TensorRT installation instructions, see [NVIDIA's TensorRT documentation](http://docs.nvidia.com/deeplearning/sdk/tensorrt-install-guide/index.html).
-
- To avoid cuDNN version conflicts during later system upgrades, you can hold - the cuDNN version at 7.0.5: - -
-    $  sudo apt-mark hold libcudnn7 libcudnn7-dev
-    
- - To later allow upgrades, you can remove the hold: - -
-    $  sudo apt-mark unhold libcudnn7 libcudnn7-dev
-    
- -If you have an earlier version of the preceding packages, please upgrade to -the specified versions. If upgrading is not possible, then you may still run -TensorFlow with GPU support, if you @{$install_sources$install TensorFlow from Sources}. +* __TensorFlow with CPU support only__. If your system does not have a + NVIDIA® GPU, you must install this version. This version of TensorFlow is + usually easier to install, so even if you have an NVIDIA GPU, we recommend + installing this version first. +* __TensorFlow with GPU support__. TensorFlow programs usually run much faster on + a GPU instead of a CPU. If you run performance-critical applications and your + system has an NVIDIA® GPU that meets the prerequisites, you should install + this version. See [TensorFlow GPU support](#NVIDIARequirements) for details. ## How to install TensorFlow @@ -131,8 +50,8 @@ On Ubuntu, Python is automatically installed and `pip` is *usually* installed. Confirm the `python` and `pip` versions:
-  python -V
-  pip -V  # or: pip3 -V
+  python -V  # or: python3 -V
+  pip -V     # or: pip3 -V
 
To install these packages on Ubuntu: @@ -264,8 +183,8 @@ On Ubuntu, Python is automatically installed and `pip` is *usually* installed. Confirm the `python` and `pip` versions:
-  python -V
-  pip -V  # or: pip3 -V
+  python -V  # or: python3 -V
+  pip -V     # or: pip3 -V
 
To install these packages on Ubuntu: @@ -578,6 +497,85 @@ If you are new to machine learning, we recommend the following: * @{$get_started/eager} + +## TensorFlow GPU support + +To install TensorFlow with GPU support, configure the following NVIDIA® software +on your system: + +* [CUDA Toolkit 9.0](http://nvidia.com/cuda). For details, see + [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/). + Append the relevant CUDA pathnames to the `LD_LIBRARY_PATH` environmental + variable as described in the NVIDIA documentation. +* [cuDNN SDK v7](http://developer.nvidia.com/cudnn). For details, see + [NVIDIA's documentation](http://docs.nvidia.com/deeplearning/sdk/cudnn-install/). + Create the `CUDA_HOME` environment variable as described in the NVIDIA + documentation. +* A GPU card with CUDA Compute Capability 3.0 or higher for building TensorFlow + from source. To use the TensorFlow binaries, version 3.5 or higher is required. + See the [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for a + list of supported GPU cards. +* [GPU drivers](http://nvidia.com/driver) that support your version of the CUDA + Toolkit. +* The `libcupti-dev` library is the NVIDIA CUDA Profile Tools Interface. This + library provides advanced profiling support. To install this library, + use the following command for CUDA Toolkit >= 8.0: + +
+  sudo apt-get install cuda-command-line-tools
+
+ +Add this path to the `LD_LIBRARY_PATH` environmental variable: + +
+  export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/cuda/extras/CUPTI/lib64
+
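To confirm the export took effect, you can inspect the environment from a
fresh Python process — an illustrative check, not one of the required steps:

```python
# Verify that the CUPTI directory appended above is present in
# LD_LIBRARY_PATH as seen by this process.
import os

paths = os.environ.get("LD_LIBRARY_PATH", "").split(":")
print("/usr/local/cuda/extras/CUPTI/lib64" in paths)  # expect True
```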
+
+For CUDA Toolkit <= 7.5, use:
+
+  sudo apt-get install libcupti-dev
+
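Whichever of the two packages provided it, the CUPTI shared library should
now be discoverable by the dynamic loader — a quick, non-authoritative check:

```python
# Ask the loader for the CUPTI library installed above; prints a library
# name such as "libcupti.so" when found, or None when it is not discoverable.
import ctypes.util

print(ctypes.util.find_library("cupti"))
```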
+
+* *OPTIONAL*: For optimized performance during inference, install
+  *NVIDIA TensorRT 3.0*. To install the minimal amount of TensorRT runtime
+  components required for use with the pre-built `tensorflow-gpu` package:
+
+  wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1404/x86_64/nvinfer-runtime-trt-repo-ubuntu1404-3.0.4-ga-cuda9.0_1.0-1_amd64.deb
+  sudo dpkg -i nvinfer-runtime-trt-repo-ubuntu1404-3.0.4-ga-cuda9.0_1.0-1_amd64.deb
+  sudo apt-get update
+  sudo apt-get install -y --allow-downgrades libnvinfer-dev libcudnn7-dev=7.0.5.15-1+cuda9.0 libcudnn7=7.0.5.15-1+cuda9.0
+
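As a sanity check that the TensorRT runtime landed, you can probe for the
library from Python — a sketch; the `nvinfer` name is an assumption inferred
from the `libnvinfer-dev` package installed above:

```python
# Probe for the TensorRT runtime library; prints None if the dynamic loader
# cannot find it.
import ctypes.util

print(ctypes.util.find_library("nvinfer"))
```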
+ +Note: For compatibility with the pre-built `tensorflow-gpu` package, use the +Ubuntu *14.04* package of TensorRT (shown above). Use this even when installing +on an Ubuntu 16.04 system. + +To build the TensorFlow-TensorRT integration module from source instead of using +the pre-built binaries, see the +[module documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/tensorrt#using-tensorrt-in-tensorflow). +For detailed TensorRT installation instructions, see +[NVIDIA's TensorRT documentation](http://docs.nvidia.com/deeplearning/sdk/tensorrt-install-guide/index.html). + +To avoid cuDNN version conflicts during later system upgrades, hold the cuDNN +version at 7.0.5: + +
+  sudo apt-mark hold libcudnn7 libcudnn7-dev
+
+
+To allow upgrades, remove this hold:
+
+  sudo apt-mark unhold libcudnn7 libcudnn7-dev
+
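With the NVIDIA software configured, a minimal way to confirm that TensorFlow
can actually see the GPU — a sketch, assuming the `tensorflow-gpu` pip
package is already installed:

```python
# Minimal GPU-visibility check. An empty string from gpu_device_name() means
# no GPU was found; is_built_with_cuda() is False for CPU-only builds.
import tensorflow as tf

print(tf.test.is_built_with_cuda())
print(tf.test.gpu_device_name())  # e.g. "/device:GPU:0"
```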
+ +If you have an earlier version of the preceding packages, upgrade to the +specified versions. If upgrading is not possible, you can still run TensorFlow +with GPU support by @{$install_sources}. + + ## Common installation problems We are relying on Stack Overflow to document TensorFlow installation problems From 184c8306a4a3d41f42f077b4898933500d61ce86 Mon Sep 17 00:00:00 2001 From: Igor Saprykin Date: Tue, 24 Apr 2018 14:52:59 -0700 Subject: [PATCH 0691/1734] Add deprecation notice to replicate_model_fn. PiperOrigin-RevId: 194150426 --- tensorflow/contrib/estimator/BUILD | 1 + .../estimator/python/estimator/replicate_model_fn.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD index 62ddb3d290e..b473de86ee8 100644 --- a/tensorflow/contrib/estimator/BUILD +++ b/tensorflow/contrib/estimator/BUILD @@ -367,6 +367,7 @@ py_library( "//tensorflow/python:sparse_tensor", "//tensorflow/python:state_ops", "//tensorflow/python:training", + "//tensorflow/python:util", "//tensorflow/python:variable_scope", "//tensorflow/python/estimator:export_output", "//tensorflow/python/estimator:model_fn", diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py index a8774d6dab9..f8564446e5d 100644 --- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py +++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py @@ -47,8 +47,12 @@ from tensorflow.python.ops.losses import losses from tensorflow.python.platform import tf_logging from tensorflow.python.training import device_setter as device_setter_lib from tensorflow.python.training import optimizer as optimizer_lib +from tensorflow.python.util import deprecation +@deprecation.deprecated( + '2018-05-31', + 'Please use `tf.contrib.distribute.MirroredStrategy` instead.') def replicate_model_fn(model_fn, loss_reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS, devices=None): @@ -255,6 +259,9 @@ class TowerOptimizer(optimizer_lib.Optimizer): COLLECTION_FOR_GRAPH_STATES = 'replicate_model_fn_graph_states' + @deprecation.deprecated( + '2018-05-31', + 'Please use `tf.contrib.distribute.MirroredStrategy` instead.') def __init__(self, optimizer_or_optimizer_fn): """Wrap an existing optimizer for gathering gradients across towers. From c13af7d5a2bde4cedd28336e688f15d9bc0d886c Mon Sep 17 00:00:00 2001 From: gracehoney <31743510+aaroey@users.noreply.github.com> Date: Tue, 24 Apr 2018 14:55:47 -0700 Subject: [PATCH 0692/1734] Fix a bug where string::substr is used with wrong position. 
---
 .../contrib/tensorrt/convert/convert_graph.cc | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index b412b296e02..07740277115 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -111,20 +111,22 @@ void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph,
   }
 }

-std::pair<string, int> ParseTensorName(string name, int default_idx = 0) {
+std::pair<string, int> ParseTensorName(const string& name,
+                                       int default_idx = 0) {
+  string name_no_idx = name;
   int idx = default_idx;
-  size_t sep = name.find_last_of(':');
+  const size_t sep = name_no_idx.find_last_of(':');
   if (sep != string::npos) {
-    name = name.substr(0, sep);
+    name_no_idx = name_no_idx.substr(0, sep);
     idx = std::stoi(name.substr(sep + 1));
   }
-  return std::make_pair(name, idx);
+  return std::make_pair(name_no_idx, idx);
 }

 std::unordered_map<string, std::vector<int>> BuildTensorNameMap(
     const std::vector<string>& tensor_names) {
   std::unordered_map<string, std::vector<int>> result;
-  for (string const& tensor_name : tensor_names) {
+  for (const string& tensor_name : tensor_names) {
     string node_name;
     int index;
     std::tie(node_name, index) = ParseTensorName(tensor_name);
@@ -132,6 +134,7 @@ std::unordered_map<string, std::vector<int>> BuildTensorNameMap(
   }
   return result;
 }
+
 // TODO(sami): convert references to pointers
 struct ConvertGraphParams {
   ConvertGraphParams(

From e7db82f821a1c522eed9e0c633df8b3db26ef38d Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne
Date: Tue, 24 Apr 2018 15:45:50 -0700
Subject: [PATCH 0693/1734] Make TF functions work with _USE_C_SHAPES=True.

It turns out regular functions need to manually copy handle data in addition
to eager GraphModeFunctions, so I moved the C extensions to python_api.h from
eager/c_api.h.

This also cleans up function_test.py to assume the C API is enabled.

PiperOrigin-RevId: 194158700
---
 tensorflow/c/eager/BUILD                     |  2 -
 tensorflow/c/eager/c_api.cc                  | 57 -------------------
 tensorflow/c/eager/c_api.h                   | 14 -----
 tensorflow/c/python_api.cc                   | 28 ++++++++-
 tensorflow/c/python_api.h                    | 10 +++-
 tensorflow/python/client/tf_session.i        |  2 +-
 tensorflow/python/eager/function.py          |  2 +-
 tensorflow/python/framework/function.py      | 10 +++-
 tensorflow/python/framework/function_test.py | 37 +++----------
 tensorflow/python/framework/ops.py           |  4 +-
 .../python/ops/resource_variable_ops.py      |  9 +--
 tensorflow/python/pywrap_tfe.i               |  2 -
 12 files changed, 58 insertions(+), 119 deletions(-)

diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index fae922ea3b4..14321191625 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -40,8 +40,6 @@ tf_cuda_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
-        # TODO(b/74620627): move this here
-        "//tensorflow/python:cpp_shape_inference_proto_cc",
     ],
 }) + select({
     "//tensorflow:with_xla_support": [

diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 975bde7c7f3..3bf071f3aba 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -48,7 +48,6 @@ limitations under the License.
#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/core/public/version.h" -#include "tensorflow/python/framework/cpp_shape_inference.pb.h" using tensorflow::int64; using tensorflow::string; @@ -503,62 +502,6 @@ void TFE_ContextExportRunMetadata(TFE_Context* ctx, TF_Buffer* buf, ctx->context.RunMetadataProto()->Clear(); } -void TFE_GetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output, - TF_Buffer* output_proto, - TF_Status* status) { - tensorflow::Node* node = &output.oper->node; - tensorflow::CppShapeInferenceResult::HandleData handle_data; - handle_data.set_is_set(true); - { - tensorflow::mutex_lock l(graph->mu); - tensorflow::shape_inference::InferenceContext* ic = - graph->refiner.GetContext(node); - CHECK(ic != nullptr); - CHECK_LT(output.index, ic->num_outputs()); - const auto* shapes_and_types = - ic->output_handle_shapes_and_types(output.index); - if (shapes_and_types == nullptr) { - output_proto->data = nullptr; - output_proto->length = 0; - output_proto->data_deallocator = nullptr; - return; - } - - for (const auto& p : *shapes_and_types) { - auto* out_shape_and_type = handle_data.add_shape_and_type(); - ic->ShapeHandleToProto(p.shape, out_shape_and_type->mutable_shape()); - out_shape_and_type->set_dtype(p.dtype); - } - } - status->status = MessageToBuffer(handle_data, output_proto); -} - -void TFE_SetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output, - const void* proto, size_t proto_len, - TF_Status* status) { - tensorflow::CppShapeInferenceResult::HandleData handle_data; - if (!handle_data.ParseFromArray(proto, proto_len)) { - status->status = tensorflow::errors::InvalidArgument( - "Couldn't deserialize HandleData proto"); - return; - } - DCHECK(handle_data.is_set()); - - tensorflow::mutex_lock l(graph->mu); - tensorflow::shape_inference::InferenceContext* ic = - graph->refiner.GetContext(&output.oper->node); - - std::vector shapes_and_types; - for (const auto& shape_and_type_proto : handle_data.shape_and_type()) { - tensorflow::shape_inference::ShapeHandle shape; - status->status = - ic->MakeShapeFromShapeProto(shape_and_type_proto.shape(), &shape); - if (status->status.ok()) return; - shapes_and_types.emplace_back(shape, shape_and_type_proto.dtype()); - } - ic->set_output_handle_shapes_and_types(output.index, shapes_and_types); -} - namespace { TFE_Op* GetFunc(TFE_Context* ctx, const tensorflow::NameAttrList& func, TF_Status* status) { diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h index ba77f3cd07f..c06ce84a8c5 100644 --- a/tensorflow/c/eager/c_api.h +++ b/tensorflow/c/eager/c_api.h @@ -329,20 +329,6 @@ TF_CAPI_EXPORT extern void TFE_ContextExportRunMetadata(TFE_Context* ctx, TF_Buffer* buf, TF_Status* status); -// Returns the serialized CppShapeInferenceResult::HandleData proto for -// `output` if its a resource tensor, or otherwise returns an empty buffer. -TF_CAPI_EXPORT extern void TFE_GetResourceHandleShapeAndType( - TF_Graph* graph, TF_Output output, TF_Buffer* output_proto, - TF_Status* status); - -// Sets `output` based on `proto`, which should be a serialized -// CppShapeInferenceResult::HandleData proto. 
-TF_CAPI_EXPORT extern void TFE_SetResourceHandleShapeAndType(TF_Graph* graph,
-                                                             TF_Output output,
-                                                             const void* proto,
-                                                             size_t proto_len,
-                                                             TF_Status* status);
-
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif

diff --git a/tensorflow/c/python_api.cc b/tensorflow/c/python_api.cc
index 93155998b86..e18fdf6c57b 100644
--- a/tensorflow/c/python_api.cc
+++ b/tensorflow/c/python_api.cc
@@ -110,7 +110,7 @@ void ExtendSession(TF_Session* session, TF_Status* status) {
   session->extend_before_run = false;
 }

-std::string ResourceHandleShapeAndType(TF_Graph* graph, TF_Output output) {
+std::string GetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output) {
   Node* node = &output.oper->node;
   CppShapeInferenceResult::HandleData handle_data;
   handle_data.set_is_set(true);
@@ -135,4 +135,30 @@ std::string ResourceHandleShapeAndType(TF_Graph* graph, TF_Output output) {
   return result;
 }

+void SetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output,
+                                   const void* proto, size_t proto_len,
+                                   TF_Status* status) {
+  tensorflow::CppShapeInferenceResult::HandleData handle_data;
+  if (!handle_data.ParseFromArray(proto, proto_len)) {
+    status->status = tensorflow::errors::InvalidArgument(
+        "Couldn't deserialize HandleData proto");
+    return;
+  }
+  DCHECK(handle_data.is_set());
+
+  tensorflow::mutex_lock l(graph->mu);
+  tensorflow::shape_inference::InferenceContext* ic =
+      graph->refiner.GetContext(&output.oper->node);
+
+  std::vector<tensorflow::shape_inference::ShapeAndType> shapes_and_types;
+  for (const auto& shape_and_type_proto : handle_data.shape_and_type()) {
+    tensorflow::shape_inference::ShapeHandle shape;
+    status->status =
+        ic->MakeShapeFromShapeProto(shape_and_type_proto.shape(), &shape);
+    if (!status->status.ok()) return;
+    shapes_and_types.emplace_back(shape, shape_and_type_proto.dtype());
+  }
+  ic->set_output_handle_shapes_and_types(output.index, shapes_and_types);
+}
+
 }  // namespace tensorflow

diff --git a/tensorflow/c/python_api.h b/tensorflow/c/python_api.h
index 2d4c8cd9ed7..4bcb5bde62c 100644
--- a/tensorflow/c/python_api.h
+++ b/tensorflow/c/python_api.h
@@ -55,9 +55,15 @@ void ExtendSession(TF_Session* session, TF_Status* status);

 // Returns the serialized CppShapeInferenceResult::HandleData proto for
 // `output` if its a resource tensor, or otherwise returns the empty string.
-// TODO(b/74620627): remove when _USE_C_SHAPES is removed
-std::string ResourceHandleShapeAndType(TF_Graph* graph, TF_Output output);
+std::string GetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output);

+// Sets `output` based on `proto`, which should be a serialized
+// CppShapeInferenceResult::HandleData proto.
+// NOTE(skyewm): `proto` is passed a void*/size_t pair instead of a std::string
+// because I couldn't get SWIG to work otherwise.
+void SetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output,
+                                   const void* proto, size_t proto_len,
+                                   TF_Status* status);
 }  // namespace tensorflow

 #endif  // TENSORFLOW_C_PYTHON_API_H_

diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index b82182d5d36..1db1432d652 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -458,7 +458,7 @@ TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper{
 }

 // Override default py3 behavior of attempting to encode into Unicode.
-%typemap(out) std::string tensorflow::ResourceHandleShapeAndType { +%typemap(out) std::string tensorflow::GetResourceHandleShapeAndType { $result = PyBytes_FromStringAndSize($1.data(), $1.size()); } diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index b924448abe6..bdbbe864df9 100644 --- a/tensorflow/python/eager/function.py +++ b/tensorflow/python/eager/function.py @@ -80,7 +80,7 @@ def capture_value(tensor_map, value, dtype, name): if handle_data is not None and handle_data.is_set: # pylint: disable=protected-access if ops._USE_C_SHAPES: - pywrap_tensorflow.TFE_SetResourceHandleShapeAndType( + pywrap_tensorflow.SetResourceHandleShapeAndType( captured_value.graph._c_graph, captured_value._as_tf_output(), handle_data.SerializeToString()) else: diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py index 9570f009a5c..f343edc4839 100644 --- a/tensorflow/python/framework/function.py +++ b/tensorflow/python/framework/function.py @@ -703,7 +703,15 @@ class _FuncGraph(ops.Graph): with ops.control_dependencies(None): ph = array_ops.placeholder(tensor.dtype, shape=tensor.get_shape()) # pylint: disable=protected-access - ph._handle_data = tensor._handle_data + if ops._USE_C_SHAPES: + handle_data = c_api.GetResourceHandleShapeAndType(tensor.graph._c_graph, + tensor._as_tf_output()) + if handle_data: + c_api.SetResourceHandleShapeAndType(ph.graph._c_graph, + ph._as_tf_output(), + compat.as_bytes(handle_data)) + else: + ph._handle_data = tensor._handle_data # pylint: enable=protected-access self._captured[tensor] = ph self.extra_args.append(ph) diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py index d6bc14fbc75..cfdacee54f5 100644 --- a/tensorflow/python/framework/function_test.py +++ b/tensorflow/python/framework/function_test.py @@ -85,7 +85,7 @@ def _OptimizerOptions(): yield cfg -@test_util.with_c_api +@test_util.with_c_shapes class FunctionTest(test.TestCase): """Test methods for verifying Function support. @@ -431,7 +431,6 @@ class FunctionTest(test.TestCase): "assertion failed.*-3"): self.assertAllEqual(Foo(constant_op.constant(-3.0)).eval(), 6.0) - @test_util.disable_c_api # Op._add_control_inputs doesn't work with C API def testAssertWrapper(self): @function.Defun(dtypes.float32) @@ -446,7 +445,6 @@ class FunctionTest(test.TestCase): "assertion"): _ = MyFn(100.0).eval() - @test_util.disable_c_api # Op._add_control_inputs doesn't work with C API def testWhileLoopCallsFunc(self): with self.test_session(use_gpu=True) as sess: @@ -466,7 +464,6 @@ class FunctionTest(test.TestCase): ans = sess.run(loop) self.assertAllClose(ans, 131072.) 
- @test_util.disable_c_api # Op._add_control_inputs doesn't work with C API def testControlFlowStrictness(self): """Inlined functions must not execute in a untaken control flow branch.""" @@ -1054,7 +1051,7 @@ class FunctionTest(test.TestCase): self.assertEqual((42.0, 44.0), sess.run((f_0, f_1))) -@test_util.with_c_api +@test_util.with_c_shapes class FunctionsFromProtos(test.TestCase): def expectFunctionsEqual(self, func, grad_func=None, new_func=None): @@ -1256,7 +1253,7 @@ class FunctionsFromProtos(test.TestCase): FunctionWithAttr.definition.attr["experimental_tag"].s, b"tag_value") -@test_util.with_c_api +@test_util.with_c_shapes class FunctionOverloadTest(test.TestCase): def testBasic(self): @@ -1309,7 +1306,7 @@ class FunctionOverloadTest(test.TestCase): "Successor of x.") -@test_util.with_c_api +@test_util.with_c_shapes class FunctionCaptureByValueTest(test.TestCase): def testCaptureByValue(self): @@ -1339,7 +1336,7 @@ class FunctionCaptureByValueTest(test.TestCase): self.assertAllEqual(y.eval(), [[12.0]]) -@test_util.with_c_api +@test_util.with_c_shapes class UnrollLSTMTest(test.TestCase): BATCH_SIZE = 16 LSTM_DIMS = 32 @@ -1475,7 +1472,7 @@ class UnrollLSTMTest(test.TestCase): self.assertAllClose(d0, d3, rtol=1e-4, atol=1e-4) -@test_util.with_c_api +@test_util.with_c_shapes class FunctionInlineControlTest(test.TestCase): def testFoo(self): @@ -1543,10 +1540,6 @@ def Linear2(w1, b1, w2, b2, x): return Linear(w2, b2, Linear(w1, b1, x)) -# Set C API before defining module level functions -ops._USE_C_API = True - - @function.Defun(*[dtypes.float32] * 3) def LinearWithCApi(w, b, x): return nn_ops.relu(math_ops.matmul(x, w) + b) @@ -1557,25 +1550,9 @@ def Linear2WithCApi(w1, b1, w2, b2, x): return LinearWithCApi(w2, b2, LinearWithCApi(w1, b1, x)) -# Unset C API after defining module level functions -ops._USE_C_API = False - - class ModuleFunctionTest(test.TestCase): def testBasic(self): - with ops.Graph().as_default(): - a, b, c, d, e = [ - constant_op.constant([[_]], dtype=dtypes.float32) for _ in range(5) - ] - y = Linear(a, b, c) - z = Linear2(a, b, c, d, e) - with session.Session() as sess: - self.assertAllEqual([[1]], sess.run(y)) - self.assertAllEqual([[5]], sess.run(z)) - - @test_util.enable_c_api - def testBasicWithCApi(self): with ops.Graph().as_default(): a, b, c, d, e = [ constant_op.constant([[_]], dtype=dtypes.float32) for _ in range(5) @@ -1587,7 +1564,7 @@ class ModuleFunctionTest(test.TestCase): self.assertAllEqual([[5]], sess.run(z)) -@test_util.with_c_api +@test_util.with_c_shapes class VariableHoistingTest(test.TestCase): def _testSimpleModel(self, use_forward_func, use_resource=False): diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 8cd6820f6a5..16a8c575c66 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -2557,8 +2557,8 @@ def _set_shape_and_handle_data_for_outputs_c_api(op): output._shape_val = output._c_api_shape() # Set the resource handle data for compatibility with the Python shape # inference code. 
- serialized = c_api.ResourceHandleShapeAndType( - op._graph._c_graph, output._as_tf_output()) + serialized = c_api.GetResourceHandleShapeAndType(op._graph._c_graph, + output._as_tf_output()) if serialized: output._handle_data = ( cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py index 4d26b2f46e3..1e953f658fc 100644 --- a/tensorflow/python/ops/resource_variable_ops.py +++ b/tensorflow/python/ops/resource_variable_ops.py @@ -24,7 +24,6 @@ from tensorflow.core.framework import variable_pb2 from tensorflow.python import pywrap_tensorflow from tensorflow.python.eager import context from tensorflow.python.eager import tape -from tensorflow.python.framework import c_api_util from tensorflow.python.framework import cpp_shape_inference_pb2 from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -47,13 +46,11 @@ def get_resource_handle_data(graph_op): assert ops._USE_C_SHAPES # pylint: disable=protected-access assert type(graph_op) == ops.Tensor # pylint: disable=unidiomatic-typecheck - with c_api_util.tf_buffer() as buf: - pywrap_tensorflow.TFE_GetResourceHandleShapeAndType( - graph_op.graph._c_graph, graph_op._as_tf_output(), buf) # pylint: disable=protected-access - data = pywrap_tensorflow.TF_GetBuffer(buf) + handle_data = pywrap_tensorflow.GetResourceHandleShapeAndType( + graph_op.graph._c_graph, graph_op._as_tf_output()) # pylint: disable=protected-access return cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData.FromString( - compat.as_bytes(data)) + compat.as_bytes(handle_data)) def _eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode): diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i index 0982a67deeb..5ee55301df9 100644 --- a/tensorflow/python/pywrap_tfe.i +++ b/tensorflow/python/pywrap_tfe.i @@ -59,8 +59,6 @@ limitations under the License. %rename("%s") TFE_ContextOptionsSetAsync; %rename("%s") TFE_DeleteContextOptions; %rename("%s") TFE_Py_TensorShapeSlice; -%rename("%s") TFE_GetResourceHandleShapeAndType; -%rename("%s") TFE_SetResourceHandleShapeAndType; %{ #include "tensorflow/python/eager/pywrap_tfe.h" From d85610e5d25b4a9150446841d659a17ae1673ddd Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 24 Apr 2018 15:49:53 -0700 Subject: [PATCH 0694/1734] Fix flaky timeouts in metric_ops_test by sharding more. PiperOrigin-RevId: 194159328 --- tensorflow/contrib/metrics/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/metrics/BUILD b/tensorflow/contrib/metrics/BUILD index 5ca42f41c1c..e050f3c8d4f 100644 --- a/tensorflow/contrib/metrics/BUILD +++ b/tensorflow/contrib/metrics/BUILD @@ -77,7 +77,7 @@ py_test( py_test( name = "metric_ops_test", srcs = ["python/ops/metric_ops_test.py"], - shard_count = 3, + shard_count = 8, srcs_version = "PY2AND3", tags = ["noasan"], # times out b/63678675 deps = [ From 29b23ba7afe79035eacf04886aa2636a093f12fb Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 24 Apr 2018 15:50:49 -0700 Subject: [PATCH 0695/1734] Add support for tensors to numpy array related assertion methods in test_util.TensorflowTestCase. 
PiperOrigin-RevId: 194159512
---
 tensorflow/python/framework/test_util.py      | 209 +++++++++++++++++-
 tensorflow/python/framework/test_util_test.py | 193 ++++++++++++++++
 2 files changed, 395 insertions(+), 7 deletions(-)

diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 5a8bc437273..dc56d88066c 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -21,6 +21,7 @@ from __future__ import print_function

 import contextlib
 import gc
+import itertools
 import math
 import random
 import re
@@ -1212,8 +1213,14 @@ class TensorFlowTestCase(googletest.TestCase):
     self.assertTrue(self._NDArrayNear(ndarray1, ndarray2, err), msg=msg)

   def _GetNdArray(self, a):
+    # If a is a tensor then convert it to ndarray
+    if isinstance(a, ops.Tensor):
+      if isinstance(a, ops._EagerTensorBase):
+        return a.numpy()
+      else:
+        a = self.evaluate(a)
     if not isinstance(a, np.ndarray):
-      a = np.array(a)
+      return np.array(a)
     return a

   def _assertArrayLikeAllClose(self, a, b, rtol=1e-6, atol=1e-6, msg=None):
@@ -1286,8 +1293,8 @@ class TensorFlowTestCase(googletest.TestCase):
     # Try to directly compare a, b as ndarrays; if not work, then traverse
     # through the sequence, which is more expensive.
     try:
-      a_as_ndarray = np.array(a)
-      b_as_ndarray = np.array(b)
+      a_as_ndarray = self._GetNdArray(a)
+      b_as_ndarray = self._GetNdArray(b)
       self._assertArrayLikeAllClose(
           a_as_ndarray,
           b_as_ndarray,
@@ -1322,16 +1329,18 @@ class TensorFlowTestCase(googletest.TestCase):
       raise

   def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6, msg=None):
-    """Asserts that two structures of numpy arrays, have near values.
+    """Asserts that two structures of numpy arrays or Tensors have near values.

     `a` and `b` can be arbitrarily nested structures. A layer of a nested
     structure can be a `dict`, `namedtuple`, `tuple` or `list`.

     Args:
       a: The expected numpy `ndarray`, or anything that can be converted into a
-        numpy `ndarray`, or any arbitrarily nested of structure of these.
+        numpy `ndarray` (including Tensor), or any arbitrarily nested
+        structure of these.
       b: The actual numpy `ndarray`, or anything that can be converted into a
-        numpy `ndarray`, or any arbitrarily nested of structure of these.
+        numpy `ndarray` (including Tensor), or any arbitrarily nested
+        structure of these.
       rtol: relative tolerance.
       atol: absolute tolerance.
       msg: Optional message to report on failure.
@@ -1391,8 +1400,26 @@ class TensorFlowTestCase(googletest.TestCase):

     self.assertAllClose(a, b, rtol=rtol, atol=atol, msg=msg)

+  def assertNotAllClose(self, a, b, **kwargs):
+    """Assert that two numpy arrays or Tensors do not have near values.
+
+    Args:
+      a: the first value to compare.
+      b: the second value to compare.
+      **kwargs: additional keyword arguments to be passed to the underlying
+        `assertAllClose` call.
+
+    Raises:
+      AssertionError: If `a` and `b` are unexpectedly close at all elements.
+    """
+    try:
+      self.assertAllClose(a, b, **kwargs)
+    except AssertionError:
+      return
+    raise AssertionError("The two values are close at all elements")
+
   def assertAllEqual(self, a, b, msg=None):
-    """Asserts that two numpy arrays have the same values.
+    """Asserts that two numpy arrays or Tensors have the same values.

     Args:
       a: the expected numpy ndarray or anything can be converted to one.
@@ -1424,6 +1451,174 @@ class TensorFlowTestCase(googletest.TestCase):
       print("not equal rhs = ", y)
       np.testing.assert_array_equal(a, b, err_msg=msg)

+  def assertAllGreater(self, a, comparison_target):
+    """Assert element values are all greater than a target value.
+
+    Args:
+      a: The numpy `ndarray`, or anything that can be converted into a
+        numpy `ndarray` (including Tensor).
+      comparison_target: The target value of comparison.
+    """
+    a = self._GetNdArray(a)
+    self.assertGreater(np.min(a), comparison_target)
+
+  def assertAllLess(self, a, comparison_target):
+    """Assert element values are all less than a target value.
+
+    Args:
+      a: The numpy `ndarray`, or anything that can be converted into a
+        numpy `ndarray` (including Tensor).
+      comparison_target: The target value of comparison.
+    """
+    a = self._GetNdArray(a)
+    self.assertLess(np.max(a), comparison_target)
+
+  def assertAllGreaterEqual(self, a, comparison_target):
+    """Assert element values are all greater than or equal to a target value.
+
+    Args:
+      a: The numpy `ndarray`, or anything that can be converted into a
+        numpy `ndarray` (including Tensor).
+      comparison_target: The target value of comparison.
+    """
+    a = self._GetNdArray(a)
+    self.assertGreaterEqual(np.min(a), comparison_target)
+
+  def assertAllLessEqual(self, a, comparison_target):
+    """Assert element values are all less than or equal to a target value.
+
+    Args:
+      a: The numpy `ndarray`, or anything that can be converted into a
+        numpy `ndarray` (including Tensor).
+      comparison_target: The target value of comparison.
+    """
+    a = self._GetNdArray(a)
+    self.assertLessEqual(np.max(a), comparison_target)
+
+  def _format_subscripts(self, subscripts, value, limit=10, indent=2):
+    """Generate a summary of ndarray subscripts as a list of str.
+
+    If limit == N, this method will print up to the first N subscripts on
+    separate lines. A line of ellipses (...) will be appended at the end if
+    the number of subscripts exceeds N.
+
+    Args:
+      subscripts: The tensor (np.ndarray) subscripts, of the same format as
+        np.where()'s return value, i.e., a tuple of arrays with each array
+        corresponding to a dimension. E.g., (array([1, 1]), array([0, 1])).
+      value: (np.ndarray) value of the tensor.
+      limit: (int) The maximum number of indices to print.
+      indent: (int) Number of characters to indent at the beginning of each
+        line.
+
+    Returns:
+      (list of str) the multi-line representation of the subscripts and values,
+        potentially with omission at the end.
+    """
+    lines = []
+    subscripts = np.transpose(subscripts)
+    prefix = " " * indent
+    for subscript in itertools.islice(subscripts, limit):
+      lines.append(prefix + str(subscript) + " : " +
+                   str(value[tuple(subscript)]))
+    if len(subscripts) > limit:
+      lines.append(prefix + "...")
+    return lines
+
+  def assertAllInRange(self,
+                       target,
+                       lower_bound,
+                       upper_bound,
+                       open_lower_bound=False,
+                       open_upper_bound=False):
+    """Assert that elements in a Tensor are all in a given range.
+
+    Args:
+      target: The numpy `ndarray`, or anything that can be converted into a
+        numpy `ndarray` (including Tensor).
+ lower_bound: lower bound of the range + upper_bound: upper bound of the range + open_lower_bound: (`bool`) whether the lower bound is open (i.e., > rather + than the default >=) + open_upper_bound: (`bool`) whether the upper bound is open (i.e., < rather + than the default <=) + + Raises: + AssertionError: + if the value tensor does not have an ordered numeric type (float* or + int*), or + if there are nan values, or + if any of the elements do not fall in the specified range. + """ + target = self._GetNdArray(target) + if not (np.issubdtype(target.dtype, np.float) or + np.issubdtype(target.dtype, np.integer)): + raise AssertionError( + "The value of %s does not have an ordered numeric type, instead it " + "has type: %s" % (target, target.dtype)) + + nan_subscripts = np.where(np.isnan(target)) + if np.size(nan_subscripts): + raise AssertionError( + "%d of the %d element(s) are NaN. " + "Subscripts(s) and value(s) of the NaN element(s):\n" % + (len(nan_subscripts[0]), np.size(target)) + + "\n".join(self._format_subscripts(nan_subscripts, target))) + + range_str = (("(" if open_lower_bound else "[") + str(lower_bound) + ", " + + str(upper_bound) + (")" if open_upper_bound else "]")) + + violations = ( + np.less_equal(target, lower_bound) + if open_lower_bound else np.less(target, lower_bound)) + violations = np.logical_or( + violations, + np.greater_equal(target, upper_bound) + if open_upper_bound else np.greater(target, upper_bound)) + violation_subscripts = np.where(violations) + if np.size(violation_subscripts): + raise AssertionError( + "%d of the %d element(s) are outside the range %s. " % + (len(violation_subscripts[0]), np.size(target), range_str) + + "Subscript(s) and value(s) of the offending elements:\n" + + "\n".join(self._format_subscripts(violation_subscripts, target))) + + def assertAllInSet(self, target, expected_set): + """Assert that elements of a Tensor are all in a given closed set. + + Args: + target: The numpy `ndarray`, or anything that can be converted into a + numpy `ndarray` (including Tensor). + expected_set: (`list`, `tuple` or `set`) The closed set that the elements + of the value of `target` are expected to fall into. + + Raises: + AssertionError: + if any of the elements do not fall into `expected_set`. + """ + target = self._GetNdArray(target) + + # Elements in target that are not in expected_set. + diff = np.setdiff1d(target.flatten(), list(expected_set)) + if np.size(diff): + raise AssertionError("%d unique element(s) are not in the set %s: %s" % + (np.size(diff), expected_set, diff)) + + def assertDTypeEqual(self, target, expected_dtype): + """Assert ndarray data type is equal to expected. + + Args: + target: The numpy `ndarray`, or anything that can be converted into a + numpy `ndarray` (including Tensor). + expected_dtype: Expected data type. 
+ """ + target = self._GetNdArray(target) + if not isinstance(target, list): + arrays = [target] + for arr in arrays: + self.assertEqual(arr.dtype, expected_dtype) + # pylint: disable=g-doc-return-or-yield @contextlib.contextmanager def assertRaisesWithPredicateMatch(self, exception_type, diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py index 02ffa93baee..8d492256aac 100644 --- a/tensorflow/python/framework/test_util_test.py +++ b/tensorflow/python/framework/test_util_test.py @@ -31,13 +31,16 @@ from tensorflow.core.framework import graph_pb2 from tensorflow.core.protobuf import meta_graph_pb2 from tensorflow.python.eager import context from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import test_ops # pylint: disable=unused-import from tensorflow.python.framework import test_util from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variables from tensorflow.python.platform import googletest @@ -209,6 +212,21 @@ class TestUtilTest(test_util.TensorFlowTestCase): self._WeMustGoDeeper("name") self._WeMustGoDeeper("orig") + def testAllCloseTensors(self): + a_raw_data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + a = constant_op.constant(a_raw_data) + b = math_ops.add(1, constant_op.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8]])) + self.assertAllClose(a, b) + self.assertAllClose(a, a_raw_data) + + a_dict = {"key": a} + b_dict = {"key": b} + self.assertAllClose(a_dict, b_dict) + + x_list = [a, b] + y_list = [a_raw_data, b] + self.assertAllClose(x_list, y_list) + def testAllCloseScalars(self): self.assertAllClose(7, 7 + 1e-8) with self.assertRaisesRegexp(AssertionError, r"Not equal to tolerance"): @@ -317,6 +335,12 @@ class TestUtilTest(test_util.TensorFlowTestCase): rtol=1e-8, atol=1e-8 ) + self.assertAllCloseAccordingToType( + constant_op.constant([1e-8], dtype=dtypes.float64), + constant_op.constant([2e-8], dtype=dtypes.float64), + rtol=1e-8, + atol=1e-8) + with (self.assertRaises(AssertionError)): self.assertAllCloseAccordingToType( np.asarray([1e-7], dtype=np.float64), @@ -332,6 +356,14 @@ class TestUtilTest(test_util.TensorFlowTestCase): float_rtol=1e-7, float_atol=1e-7 ) + self.assertAllCloseAccordingToType( + constant_op.constant([1e-7], dtype=dtypes.float32), + constant_op.constant([2e-7], dtype=dtypes.float32), + rtol=1e-8, + atol=1e-8, + float_rtol=1e-7, + float_atol=1e-7) + with (self.assertRaises(AssertionError)): self.assertAllCloseAccordingToType( np.asarray([1e-6], dtype=np.float32), @@ -349,6 +381,16 @@ class TestUtilTest(test_util.TensorFlowTestCase): half_rtol=1e-4, half_atol=1e-4 ) + self.assertAllCloseAccordingToType( + constant_op.constant([1e-4], dtype=dtypes.float16), + constant_op.constant([2e-4], dtype=dtypes.float16), + rtol=1e-8, + atol=1e-8, + float_rtol=1e-7, + float_atol=1e-7, + half_rtol=1e-4, + half_atol=1e-4) + with (self.assertRaises(AssertionError)): self.assertAllCloseAccordingToType( np.asarray([1e-3], dtype=np.float16), @@ -358,6 +400,157 @@ class TestUtilTest(test_util.TensorFlowTestCase): half_rtol=1e-4, half_atol=1e-4 ) + def testAssertAllEqual(self): + i = variables.Variable([100] * 3, dtype=dtypes.int32, name="i") + j = constant_op.constant([20] * 3, 
dtype=dtypes.int32, name="j") + k = math_ops.add(i, j, name="k") + + self.evaluate(variables.global_variables_initializer()) + self.assertAllEqual([120] * 3, k) + self.assertAllEqual([20] * 3, j) + + def testAssertNotAllClose(self): + # Test with arrays + self.assertNotAllClose([0.1], [0.2]) + with self.assertRaises(AssertionError): + self.assertNotAllClose([-1.0, 2.0], [-1.0, 2.0]) + + # Test with tensors + x = constant_op.constant([1.0, 1.0], name="x") + y = math_ops.add(x, x) + + self.assertAllClose([2.0, 2.0], y) + self.assertNotAllClose([0.9, 1.0], x) + + with self.assertRaises(AssertionError): + self.assertNotAllClose([1.0, 1.0], x) + + def testAssertNotAllCloseRTol(self): + # Test with arrays + with self.assertRaises(AssertionError): + self.assertNotAllClose([1.1, 2.1], [1.0, 2.0], rtol=0.2) + + # Test with tensors + x = constant_op.constant([1.0, 1.0], name="x") + y = math_ops.add(x, x) + + self.assertAllClose([2.0, 2.0], y) + + with self.assertRaises(AssertionError): + self.assertNotAllClose([0.9, 1.0], x, rtol=0.2) + + def testAssertNotAllCloseATol(self): + # Test with arrays + with self.assertRaises(AssertionError): + self.assertNotAllClose([1.1, 2.1], [1.0, 2.0], atol=0.2) + + # Test with tensors + x = constant_op.constant([1.0, 1.0], name="x") + y = math_ops.add(x, x) + + self.assertAllClose([2.0, 2.0], y) + + with self.assertRaises(AssertionError): + self.assertNotAllClose([0.9, 1.0], x, atol=0.2) + + def testAssertAllGreaterLess(self): + x = constant_op.constant([100.0, 110.0, 120.0], dtype=dtypes.float32) + y = constant_op.constant([10.0] * 3, dtype=dtypes.float32) + z = math_ops.add(x, y) + + self.assertAllClose([110.0, 120.0, 130.0], z) + + self.assertAllGreater(x, 95.0) + self.assertAllLess(x, 125.0) + + with self.assertRaises(AssertionError): + self.assertAllGreater(x, 105.0) + with self.assertRaises(AssertionError): + self.assertAllGreater(x, 125.0) + + with self.assertRaises(AssertionError): + self.assertAllLess(x, 115.0) + with self.assertRaises(AssertionError): + self.assertAllLess(x, 95.0) + + def testAssertAllGreaterLessEqual(self): + x = constant_op.constant([100.0, 110.0, 120.0], dtype=dtypes.float32) + y = constant_op.constant([10.0] * 3, dtype=dtypes.float32) + z = math_ops.add(x, y) + + self.assertAllEqual([110.0, 120.0, 130.0], z) + + self.assertAllGreaterEqual(x, 95.0) + self.assertAllLessEqual(x, 125.0) + + with self.assertRaises(AssertionError): + self.assertAllGreaterEqual(x, 105.0) + with self.assertRaises(AssertionError): + self.assertAllGreaterEqual(x, 125.0) + + with self.assertRaises(AssertionError): + self.assertAllLessEqual(x, 115.0) + with self.assertRaises(AssertionError): + self.assertAllLessEqual(x, 95.0) + + def testAssertAllInRangeWithNonNumericValuesFails(self): + s1 = constant_op.constant("Hello, ", name="s1") + c = constant_op.constant([1 + 2j, -3 + 5j], name="c") + b = constant_op.constant([False, True], name="b") + + with self.assertRaises(AssertionError): + self.assertAllInRange(s1, 0.0, 1.0) + with self.assertRaises(AssertionError): + self.assertAllInRange(c, 0.0, 1.0) + with self.assertRaises(AssertionError): + self.assertAllInRange(b, 0, 1) + + def testAssertAllInRange(self): + x = constant_op.constant([10.0, 15.0], name="x") + self.assertAllInRange(x, 10, 15) + + with self.assertRaises(AssertionError): + self.assertAllInRange(x, 10, 15, open_lower_bound=True) + with self.assertRaises(AssertionError): + self.assertAllInRange(x, 10, 15, open_upper_bound=True) + with self.assertRaises(AssertionError): + self.assertAllInRange( + x, 10, 
15, open_lower_bound=True, open_upper_bound=True) + + def testAssertAllInRangeErrorMessageEllipses(self): + x_init = np.array([[10.0, 15.0]] * 12) + x = constant_op.constant(x_init, name="x") + with self.assertRaises(AssertionError): + self.assertAllInRange(x, 5, 10) + + def testAssertAllInRangeDetectsNaNs(self): + x = constant_op.constant( + [[np.nan, 0.0], [np.nan, np.inf], [np.inf, np.nan]], name="x") + with self.assertRaises(AssertionError): + self.assertAllInRange(x, 0.0, 2.0) + + def testAssertAllInRangeWithInfinities(self): + x = constant_op.constant([10.0, np.inf], name="x") + self.assertAllInRange(x, 10, np.inf) + with self.assertRaises(AssertionError): + self.assertAllInRange(x, 10, np.inf, open_upper_bound=True) + + def testAssertAllInSet(self): + b = constant_op.constant([True, False], name="b") + x = constant_op.constant([13, 37], name="x") + + self.assertAllInSet(b, [False, True]) + self.assertAllInSet(b, (False, True)) + self.assertAllInSet(b, {False, True}) + self.assertAllInSet(x, [0, 13, 37, 42]) + self.assertAllInSet(x, (0, 13, 37, 42)) + self.assertAllInSet(x, {0, 13, 37, 42}) + + with self.assertRaises(AssertionError): + self.assertAllInSet(b, [False]) + with self.assertRaises(AssertionError): + self.assertAllInSet(x, (42,)) + def testRandomSeed(self): # Call setUp again for WithCApi case (since it makes a new defeault graph # after setup). From a8654769c1faf6327b715edae614eb48775394a1 Mon Sep 17 00:00:00 2001 From: anj-s <32556631+anj-s@users.noreply.github.com> Date: Tue, 24 Apr 2018 16:28:41 -0700 Subject: [PATCH 0696/1734] 1.8r Cherrypick request-cherrypicks_30740: Fix for dropped metrics in evaluate function for Keras models. (#18799) --- .../keras/_impl/keras/engine/training.py | 29 ++------- .../_impl/keras/engine/training_eager.py | 39 ++++-------- .../_impl/keras/engine/training_eager_test.py | 11 ++-- .../keras/_impl/keras/engine/training_test.py | 26 ++++++++ .../_impl/keras/engine/training_utils.py | 62 +++++++++++++++++++ 5 files changed, 109 insertions(+), 58 deletions(-) diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py index 71de657da81..2b72e0e33dd 100644 --- a/tensorflow/python/keras/_impl/keras/engine/training.py +++ b/tensorflow/python/keras/_impl/keras/engine/training.py @@ -276,6 +276,8 @@ class Model(Network): self.metrics_names.append(self.output_names[i] + '_loss') self.nested_metrics = training_utils.collect_metrics(metrics, self.output_names) + with K.name_scope('metrics'): + training_utils.populate_metric_names(self) self._feed_sample_weight_modes = [] for i in range(len(self.outputs)): self._feed_sample_weight_modes.append(None) @@ -462,7 +464,6 @@ class Model(Network): output_weighted_metrics = nested_weighted_metrics[i] def handle_metrics(metrics, weights=None): - metric_name_prefix = 'weighted_' if weights is not None else '' for metric in metrics: if metric in ('accuracy', 'acc', 'crossentropy', 'ce'): @@ -489,39 +490,19 @@ class Model(Network): metric_fn = metrics_module.categorical_accuracy elif metric in ('crossentropy', 'ce'): metric_fn = metrics_module.categorical_crossentropy - if metric in ('accuracy', 'acc'): - suffix = 'acc' - elif metric in ('crossentropy', 'ce'): - suffix = 'ce' weighted_metric_fn = training_utils.weighted_masked_objective( metric_fn) - metric_name = metric_name_prefix + suffix else: metric_fn = metrics_module.get(metric) weighted_metric_fn = training_utils.weighted_masked_objective( metric_fn) - # Get metric name as string - if 
hasattr(metric_fn, 'name'): - metric_name = metric_fn.name - else: - metric_name = metric_fn.__name__ - metric_name = metric_name_prefix + metric_name - + metric_name = training_utils.get_base_metric_name( + metric, weighted=weights is not None) with K.name_scope(metric_name): metric_result = weighted_metric_fn( y_true, y_pred, weights=weights, mask=masks[i]) - # Append to self.metrics_names, self.metric_tensors, - # self.stateful_metric_names - if len(self.output_names) > 1: - metric_name = '%s_%s' % (self.output_names[i], metric_name) - # Dedupe name - j = 1 - base_metric_name = metric_name - while metric_name in self.metrics_names: - metric_name = '%s_%d' % (base_metric_name, j) - j += 1 - self.metrics_names.append(metric_name) + training_utils.add_metric_name(self, metric_name, i) self.metrics_tensors.append(metric_result) # Keep track of state updates created by diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager.py b/tensorflow/python/keras/_impl/keras/engine/training_eager.py index 695669d9ee1..ad239d6151e 100644 --- a/tensorflow/python/keras/_impl/keras/engine/training_eager.py +++ b/tensorflow/python/keras/_impl/keras/engine/training_eager.py @@ -100,7 +100,7 @@ def _eager_metrics_fn(model, outputs, targets): metric_names.append(metric_name) metric_results.append(backend.mean(metric_result)) - return metric_names, metric_results + return metric_results def _model_loss(model, inputs, targets, sample_weights=None, training=False): @@ -151,7 +151,12 @@ def _model_loss(model, inputs, targets, sample_weights=None, training=False): with backend.name_scope(model.output_names[i] + '_loss'): output_loss = weighted_masked_fn( targets[i], outs[i], weights, mask=mask) - loss_metrics.append(backend.mean(output_loss)) + # If the number of outputs is 1 then we don't append the loss metric + # associated with each model output. When there are multiple outputs + # associated with a model, each output's loss is calculated and returned + # as part of the loss_metrics. + if len(model.outputs) > 1: + loss_metrics.append(backend.mean(output_loss)) loss_weight = model.loss_weights_list[i] if total_loss is None: @@ -274,7 +279,7 @@ def train_on_batch(model, inputs, targets, sample_weights=None): model, inputs, targets, sample_weights=sample_weights, training=True) if not isinstance(outs, list): outs = [outs] - _, metrics_results = _eager_metrics_fn( + metrics_results = _eager_metrics_fn( model, outs, targets) if not isinstance(loss, list): loss = [loss] @@ -304,7 +309,7 @@ def test_on_batch(model, inputs, targets, sample_weights=None): model, inputs, targets, sample_weights=sample_weights, training=False) if not isinstance(outs, list): outs = [outs] - _, metrics_results = _eager_metrics_fn( + metrics_results = _eager_metrics_fn( model, outs, targets) if not isinstance(loss, list): loss = [loss] @@ -498,34 +503,12 @@ def fit_loop( for l, o in zip(out_labels, outs): batch_logs[l] = o # Required for Eager mode - metrics_names, metrics_results = _eager_metrics_fn( - model, outs, targets_batch) + metrics_results = _eager_metrics_fn(model, outs, targets_batch) batch_logs['loss'] = tensor_util.constant_value(backend.mean(loss)) - # TODO(anjalisridhar): Move this to compile to avoid duplicate code. - # In graph mode we set the metric names in compile. However in - # Eager mode we calculate the metrics for each batch in fit_loop. - # We could calculate the metric names and functions in compile. - # This would avoid setting the callback parameters separately. 
- # We need to do this for the first iteration alone - for m in metrics_names: - if m not in callback_metrics: - callback_metrics.append(m) - - callbacks.set_params({ - 'batch_size': batch_size, - 'epochs': epochs, - 'steps': steps_per_epoch, - 'samples': num_train_samples, - 'verbose': verbose, - 'do_validation': do_validation, - 'metrics': callback_metrics or [], - }) - for k, v in zip(model.metrics_names, [backend.mean(loss)] + loss_metrics + metrics_results): batch_logs[k] = tensor_util.constant_value(v) - callbacks.on_batch_end(batch_index, batch_logs) if callback_model.stop_training: break @@ -611,7 +594,7 @@ def test_loop(model, inputs, targets, targets_batch, sample_weights=sample_weights_batch, training=False) - _, metrics_results = _eager_metrics_fn(model, loss_outs, targets_batch) + metrics_results = _eager_metrics_fn(model, loss_outs, targets_batch) batch_outs = [] for _, v in zip(model.metrics_names, [backend.mean(loss)] + loss_metrics + metrics_results): diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py index ed0f91ee1e2..c45e07e08bc 100644 --- a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py +++ b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py @@ -212,7 +212,7 @@ class TrainingTest(test.TestCase): optimizer = RMSPropOptimizer(learning_rate=0.001) loss = 'mse' loss_weights = [1., 0.5] - metrics = ['mae'] + metrics = ['acc', 'mae'] model.compile( optimizer, loss, @@ -231,20 +231,20 @@ class TrainingTest(test.TestCase): [input_a_np, input_b_np], [output_d_np, output_e_np], batch_size=5, verbose=0) - self.assertEqual(len(out), 5) + self.assertEqual(len(out), 7) out = model.evaluate( [input_a_np, input_b_np], [output_d_np, output_e_np], batch_size=5, verbose=1) - self.assertEqual(len(out), 5) + self.assertEqual(len(out), 7) out = model.evaluate( [input_a_np, input_b_np], [output_d_np, output_e_np], batch_size=5, verbose=2) - self.assertEqual(len(out), 5) + self.assertEqual(len(out), 7) out = model.test_on_batch([input_a_np, input_b_np], [output_d_np, output_e_np]) - self.assertEqual(len(out), 5) + self.assertEqual(len(out), 7) # Test evaluate with dictionary inputs model.evaluate( @@ -625,7 +625,6 @@ class LossWeightingTest(test.TestCase): bad_w_np = np.random.random((10, 2, 2)) model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np}) - class CorrectnessTest(test.TestCase): @tf_test_util.run_in_graph_and_eager_modes() diff --git a/tensorflow/python/keras/_impl/keras/engine/training_test.py b/tensorflow/python/keras/_impl/keras/engine/training_test.py index 08fd26dd18d..47d80704cf6 100644 --- a/tensorflow/python/keras/_impl/keras/engine/training_test.py +++ b/tensorflow/python/keras/_impl/keras/engine/training_test.py @@ -23,11 +23,14 @@ import unittest import numpy as np +from tensorflow.python.framework import test_util as tf_test_util from tensorflow.python.keras._impl import keras from tensorflow.python.keras._impl.keras import testing_utils from tensorflow.python.keras._impl.keras.engine.training_utils import weighted_masked_objective from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays from tensorflow.python.platform import test +from tensorflow.python.training.rmsprop import RMSPropOptimizer + try: import scipy.sparse as scipy_sparse # pylint: disable=g-import-not-at-top @@ -1667,6 +1670,29 @@ class TestTrainingWithDataTensors(test.TestCase): model.train_on_batch([input_a_np, input_b_np], 
[output_a_np, output_b_np]) + @tf_test_util.run_in_graph_and_eager_modes() + def test_metric_names_are_identical_in_graph_and_eager(self): + a = keras.layers.Input(shape=(3,), name='input_a') + b = keras.layers.Input(shape=(3,), name='input_b') + + dense = keras.layers.Dense(4, name='dense') + c = dense(a) + d = dense(b) + e = keras.layers.Dropout(0.5, name='dropout')(c) + + model = keras.models.Model([a, b], [d, e]) + + optimizer = RMSPropOptimizer(learning_rate=0.001) + loss = 'mse' + loss_weights = [1., 0.5] + metrics = ['mae', 'acc'] + model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights) + reference_metric_names = ['loss', 'dense_loss', 'dropout_loss', + 'dense_mean_absolute_error', + 'dense_acc', + 'dropout_mean_absolute_error', + 'dropout_acc'] + self.assertEqual(reference_metric_names, model.metrics_names) if __name__ == '__main__': # Bazel sets these environment variables to very long paths. diff --git a/tensorflow/python/keras/_impl/keras/engine/training_utils.py b/tensorflow/python/keras/_impl/keras/engine/training_utils.py index a3fc8ef2a03..34c0738f26f 100644 --- a/tensorflow/python/keras/_impl/keras/engine/training_utils.py +++ b/tensorflow/python/keras/_impl/keras/engine/training_utils.py @@ -26,6 +26,7 @@ from tensorflow.python.eager import context from tensorflow.python.framework import tensor_util from tensorflow.python.keras._impl.keras import backend as K from tensorflow.python.keras._impl.keras import losses +from tensorflow.python.keras._impl.keras import metrics as metrics_module from tensorflow.python.ops import math_ops @@ -553,3 +554,64 @@ def standardize_weights(y, def has_symbolic_tensors(ls): return (any(tensor_util.is_tensor(v) for v in ls) and not context.executing_eagerly()) + + +def populate_metric_names(model): + for i in range(len(model.outputs)): + metrics = model.nested_metrics[i] + for metric in metrics: + base_metric_name = get_base_metric_name(metric) + add_metric_name(model, base_metric_name, i) + + +def get_base_metric_name(metric, weighted=False): + """Returns the metric name given the metric function. + + Arguments: + metric: Metric function name or reference. + weighted: Boolean indicating if the metric for which we are adding + names is weighted. + + Returns: + a metric name. + """ + metric_name_prefix = 'weighted_' if weighted else '' + if metric in ('accuracy', 'acc', 'crossentropy', 'ce'): + if metric in ('accuracy', 'acc'): + suffix = 'acc' + elif metric in ('crossentropy', 'ce'): + suffix = 'ce' + metric_name = metric_name_prefix + suffix + else: + metric_fn = metrics_module.get(metric) + # Get metric name as string + if hasattr(metric_fn, 'name'): + metric_name = metric_fn.name + else: + metric_name = metric_fn.__name__ + metric_name = metric_name_prefix + metric_name + + return metric_name + + +def add_metric_name(model, metric_name, index): + """Makes the metric name unique and adds it to the model's metric name list. + + If there are multiple outputs for which the metrics are calculated, the + metric names have to be made unique by appending an integer. + + Arguments: + model: Model to which we are adding metric names. + metric_name: Metric name that corresponds to the metric specified by the + user. For example: 'acc' + index: The index of the model output for which the metric name is being + added. 
+ """ + if len(model.output_names) > 1: + metric_name = '%s_%s' % (model.output_names[index], metric_name) + j = 1 + base_metric_name = metric_name + while metric_name in model.metrics_names: + metric_name = '%s_%d' % (base_metric_name, j) + j += 1 + model.metrics_names.append(metric_name) From 2ca2390277c2a4ea2d92fb72782bf30bfe00f592 Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Tue, 24 Apr 2018 16:34:01 -0700 Subject: [PATCH 0697/1734] Fixing the mock import error for devel docker. --- tensorflow/tools/docker/Dockerfile.devel | 1 + tensorflow/tools/docker/Dockerfile.devel-gpu | 1 + 2 files changed, 2 insertions(+) diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel index 390d7442c37..5c49ac1d8d2 100644 --- a/tensorflow/tools/docker/Dockerfile.devel +++ b/tensorflow/tools/docker/Dockerfile.devel @@ -31,6 +31,7 @@ RUN pip --no-cache-dir install \ ipykernel \ jupyter \ matplotlib \ + mock \ numpy \ scipy \ sklearn \ diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu index 293028d229a..196227861b2 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-gpu +++ b/tensorflow/tools/docker/Dockerfile.devel-gpu @@ -40,6 +40,7 @@ RUN pip --no-cache-dir install \ ipykernel \ jupyter \ matplotlib \ + mock \ numpy \ scipy \ sklearn \ From 2495ec22832c846b149c394aece2db19f2813b45 Mon Sep 17 00:00:00 2001 From: Igor Saprykin Date: Tue, 24 Apr 2018 16:52:29 -0700 Subject: [PATCH 0698/1734] Disable UseTowerEstimatorWithoutReplication.test_train_single_tower. PiperOrigin-RevId: 194168031 --- .../estimator/replicate_model_fn_test.py | 53 ------------------- 1 file changed, 53 deletions(-) diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py index 144b45982c8..dd8a3a95f1b 100644 --- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py +++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py @@ -540,59 +540,6 @@ class ReplicateAcrossASingleDeviceWithoutTowerOptimizer( self.assertEqual(7.0, session.run(c)) -class UseTowerEstimatorWithoutReplication(test_util.TensorFlowTestCase): - - def model_fn(self, mode, features, labels, params): - c = variable_scope.get_variable( - 'c', - initializer=constant_op.constant(10, dtype=dtypes.float64), - dtype=dtypes.float64) - - features = features['features'] - predictions = math_ops.multiply(features, c) - - loss = losses.absolute_difference( - labels=labels, predictions=predictions, reduction=losses.Reduction.SUM) - loss = math_ops.reduce_sum(loss) - - metrics = { - 'accuracy': metrics_lib.accuracy(labels, predictions), - 'auc': metrics_lib.auc(labels, predictions) - } - - optimizer = replicate_model_fn.TowerOptimizer( - gradient_descent.GradientDescentOptimizer(params['learning_rate'])) - - return model_fn_lib.EstimatorSpec( - mode=mode, - loss=loss, - eval_metric_ops=metrics, - predictions={'probabilities': predictions}, - train_op=optimizer.minimize(loss)) - - @property - def params(self): - params = {} - params['learning_rate'] = 1.0 - return params - - def test_train_single_tower(self): - features = np.array([[1.0], [2.0]]) - labels = np.array([[1.0], [2.0]]) - - train_input_fn = numpy_io.numpy_input_fn( - x={'features': features}, y=labels, batch_size=2, shuffle=False) - - with self.test_session(): - estimator = estimator_lib.Estimator( - model_fn=self.model_fn, - model_dir=tempfile.mkdtemp(), - params=self.params) - 
estimator.train(train_input_fn, steps=1) - - self.assertEqual(7.0, estimator.get_variable_value('c')) - - class MakeSureSyncReplicasOptimizerWorks(test_util.TensorFlowTestCase): def model_fn(self, mode, features, labels, params): From d1d5fc27ad8d84f1468ce459ba8fab208b174c6f Mon Sep 17 00:00:00 2001 From: Francois Chollet <> Date: Tue, 24 Apr 2018 17:00:40 -0700 Subject: [PATCH 0699/1734] Fix critical metrics computation bug with Model in Eager mode. --- tensorflow/python/keras/_impl/keras/engine/training_eager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager.py b/tensorflow/python/keras/_impl/keras/engine/training_eager.py index 4cdb5f108a0..924f74e5b66 100644 --- a/tensorflow/python/keras/_impl/keras/engine/training_eager.py +++ b/tensorflow/python/keras/_impl/keras/engine/training_eager.py @@ -96,7 +96,7 @@ def _eager_metrics_fn(model, outputs, targets): model.metrics_names.append(metric_name) with backend.name_scope(metric_name): - metric_result = metric_fn(outputs[i], targets[i]) + metric_result = metric_fn(targets[i], outputs[i]) metric_names.append(metric_name) metric_results.append(backend.mean(metric_result)) From 44203871672b85d936797cb60bab6731ad6a2824 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 24 Apr 2018 23:58:22 +0000 Subject: [PATCH 0700/1734] Enable int8 support for FloorDiv int8 is enabled for FloorDiv in math_ops.cc though the kernel was not registered. This fix register the int8 kernel for FloorDiv, and enables the test case for it. Signed-off-by: Yong Tang --- tensorflow/core/kernels/cwise_op_floor_div.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/cwise_op_floor_div.cc b/tensorflow/core/kernels/cwise_op_floor_div.cc index fecbf859897..24da61fdf6c 100644 --- a/tensorflow/core/kernels/cwise_op_floor_div.cc +++ b/tensorflow/core/kernels/cwise_op_floor_div.cc @@ -16,8 +16,8 @@ limitations under the License. #include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER5(BinaryOp, CPU, "FloorDiv", functor::safe_floor_div, uint8, uint16, - int16, int32, int64); +REGISTER6(BinaryOp, CPU, "FloorDiv", functor::safe_floor_div, uint8, uint16, + int8, int16, int32, int64); REGISTER3(BinaryOp, CPU, "FloorDiv", functor::floor_div_real, float, Eigen::half, double); From 552783ec41b9cd7fa678ebc6dd1c8371c69f8974 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 25 Apr 2018 00:00:45 +0000 Subject: [PATCH 0701/1734] Add np.int8, np.int16 test cases for div tests Signed-off-by: Yong Tang --- tensorflow/python/kernel_tests/division_past_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/kernel_tests/division_past_test.py b/tensorflow/python/kernel_tests/division_past_test.py index 2ff2f894077..e5c86719d3c 100644 --- a/tensorflow/python/kernel_tests/division_past_test.py +++ b/tensorflow/python/kernel_tests/division_past_test.py @@ -36,7 +36,7 @@ class DivisionTestCase(test.TestCase): values = [1, 2, 7, 11] functions = (lambda x: x), constant_op.constant # TODO(irving): Test int8, int16 once we support casts for those. - dtypes = np.int32, np.int64, np.float32, np.float64 + dtypes = np.int8, np.int16, np.int32, np.int64, np.float32, np.float64 tensors = [] checks = [] From d42d3640a48a6eecf2696d1cfe247de8f571dccb Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 25 Apr 2018 00:01:27 +0000 Subject: [PATCH 0702/1734] Remove TODO as it is done now. 
Signed-off-by: Yong Tang
---
 tensorflow/python/kernel_tests/division_past_test.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/division_past_test.py b/tensorflow/python/kernel_tests/division_past_test.py
index e5c86719d3c..9ddd62e63cc 100644
--- a/tensorflow/python/kernel_tests/division_past_test.py
+++ b/tensorflow/python/kernel_tests/division_past_test.py
@@ -35,7 +35,6 @@ class DivisionTestCase(test.TestCase):
     """Test all the different ways to divide."""
     values = [1, 2, 7, 11]
     functions = (lambda x: x), constant_op.constant
-    # TODO(irving): Test int8, int16 once we support casts for those.
     dtypes = np.int8, np.int16, np.int32, np.int64, np.float32, np.float64

     tensors = []

From e871ea871fc39521dfa3c9f659b1d576c835c1e9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Tue, 24 Apr 2018 17:02:46 -0700
Subject: [PATCH 0703/1734] Fixed typo in an error message.

PiperOrigin-RevId: 194169339
---
 tensorflow/core/kernels/string_split_op.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/string_split_op.cc b/tensorflow/core/kernels/string_split_op.cc
index 9efbd66ef75..4c2b312c345 100644
--- a/tensorflow/core/kernels/string_split_op.cc
+++ b/tensorflow/core/kernels/string_split_op.cc
@@ -71,7 +71,7 @@ class StringSplitOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->input("delimiter", &delimiter_tensor));
     OP_REQUIRES(
         ctx, TensorShapeUtils::IsScalar(delimiter_tensor->shape()),
-        errors::InvalidArgument("delimiter must scalar, got shape: ",
+        errors::InvalidArgument("delimiter must be a scalar, got shape: ",
                                 delimiter_tensor->shape().DebugString()));
     const auto delimiter_vec = delimiter_tensor->flat<string>();
     const string& delimiter = delimiter_vec(0);

From 8b3c5e62be825d78bc25b3c4b6c65a44d47416e0 Mon Sep 17 00:00:00 2001
From: Akshay Agrawal
Date: Tue, 24 Apr 2018 17:35:08 -0700
Subject: [PATCH 0704/1734] `PartitionedCallOp`: An op for executing multi-device functions.

A `PartitionedCallOp` allows for the execution of functions across multiple
devices but within a single process. It proceeds by placing and partitioning
the graph underlying a given function body, then instantiating a function
for each partitioned subgraph. The yielded function shards, which together
are equivalent to the original function, are then executed.

`PartitionedCallOp` is not part of the public TensorFlow API.
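For illustration only, a hypothetical sketch of driving the op through the
generated Python wrapper; the gen_functional_ops.partitioned_call name and
signature, and the Defun-style function body, are assumptions rather than a
documented interface (the functional_ops_test.py cases added below are the
authoritative usage reference):

    from tensorflow.python.framework import constant_op
    from tensorflow.python.framework import dtypes
    from tensorflow.python.framework import function
    from tensorflow.python.ops import gen_functional_ops

    @function.Defun(dtypes.float32, dtypes.float32)
    def plus(x, y):
      # Ops in the body may be pinned to different devices; the kernel
      # places and partitions this graph, then runs one shard per device.
      return x + y

    out = gen_functional_ops.partitioned_call(
        args=[constant_op.constant(1.0), constant_op.constant(2.0)],
        Tout=[dtypes.float32], f=plus)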
PiperOrigin-RevId: 194173114 --- tensorflow/compiler/jit/BUILD | 37 --- .../jit/encapsulate_subgraphs_pass.cc | 2 +- .../jit/encapsulate_subgraphs_pass_test.cc | 2 +- tensorflow/compiler/tf2xla/BUILD | 1 - .../tf2xla/functionalize_control_flow.cc | 2 +- tensorflow/core/BUILD | 5 + .../base_api/api_def_PartitionedCall.pbtxt | 23 ++ .../python_api/api_def_PartitionedCall.pbtxt | 1 + .../framework}/graph_to_functiondef.cc | 4 +- .../framework}/graph_to_functiondef.h | 9 +- .../framework}/graph_to_functiondef_test.cc | 2 +- tensorflow/core/kernels/BUILD | 12 + .../core/kernels/partitioned_function_ops.cc | 279 ++++++++++++++++++ tensorflow/core/ops/functional_ops.cc | 9 + tensorflow/python/kernel_tests/BUILD | 1 + .../kernel_tests/functional_ops_test.py | 106 +++++++ tensorflow/python/ops/functional_ops.py | 7 +- 17 files changed, 451 insertions(+), 51 deletions(-) create mode 100644 tensorflow/core/api_def/base_api/api_def_PartitionedCall.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_PartitionedCall.pbtxt rename tensorflow/{compiler/jit => core/framework}/graph_to_functiondef.cc (98%) rename tensorflow/{compiler/jit => core/framework}/graph_to_functiondef.h (79%) rename tensorflow/{compiler/jit => core/framework}/graph_to_functiondef_test.cc (98%) create mode 100644 tensorflow/core/kernels/partitioned_function_ops.cc diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 53b124cf890..af2965bba5b 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -257,19 +257,6 @@ cc_library( alwayslink = 1, ) -cc_library( - name = "graph_to_functiondef", - srcs = ["graph_to_functiondef.cc"], - hdrs = ["graph_to_functiondef.h"], - visibility = [":friends"], - deps = [ - "//tensorflow/core:core_cpu", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - ], -) - cc_library( name = "create_xla_launch_op", srcs = [ @@ -300,7 +287,6 @@ cc_library( ], deps = [ ":common", - ":graph_to_functiondef", ":shape_inference_helpers", ":union_find", "//tensorflow/compiler/jit/graphcycles", @@ -347,28 +333,6 @@ tf_cc_test( ], ) -tf_cc_test( - name = "graph_to_functiondef_test", - size = "small", - srcs = [ - "graph_to_functiondef_test.cc", - ], - deps = [ - ":graph_to_functiondef", - "//tensorflow/cc:cc_ops", - "//tensorflow/cc:cc_ops_internal", - "//tensorflow/cc:function_ops", - "//tensorflow/cc:ops", - "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/tf2xla/kernels:xla_ops", - "//tensorflow/core:core_cpu", - "//tensorflow/core:framework_internal", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "//tensorflow/core:testlib", - ], -) - tf_cc_test( name = "compilation_passes_test", size = "small", @@ -379,7 +343,6 @@ tf_cc_test( deps = [ ":common", ":compilation_passes", - ":graph_to_functiondef", "//tensorflow/cc:cc_ops", "//tensorflow/cc:cc_ops_internal", "//tensorflow/cc:function_ops", diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc index 7507e193b56..f06debaf316 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc @@ -22,7 +22,6 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/jit/graph_to_functiondef.h" #include "tensorflow/compiler/jit/graphcycles/graphcycles.h" #include "tensorflow/compiler/jit/legacy_flags/encapsulate_subgraphs_pass_flags.h" #include "tensorflow/compiler/jit/mark_for_compilation_pass.h" @@ -35,6 +34,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/shape_refiner.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_def_util.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/graph/algorithm.h" diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc index 3502d1bb459..5ec24d39a2c 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc @@ -20,8 +20,8 @@ limitations under the License. #include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/ops/standard_ops.h" -#include "tensorflow/compiler/jit/graph_to_functiondef.h" #include "tensorflow/core/framework/function_testlib.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/graph_def_builder.h" #include "tensorflow/core/lib/core/status_test_util.h" diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index ba5c3a14849..942504e6bd4 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -412,7 +412,6 @@ cc_library( hdrs = ["functionalize_control_flow.h"], deps = [ ":tf2xla_util", - "//tensorflow/compiler/jit:graph_to_functiondef", "//tensorflow/compiler/jit:union_find", "//tensorflow/compiler/tf2xla:dump_graph", "//tensorflow/compiler/tf2xla/ops:xla_ops", diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc index 23629d85aed..8d1f2684909 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc @@ -21,13 +21,13 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/jit/graph_to_functiondef.h" #include "tensorflow/compiler/jit/union_find.h" #include "tensorflow/compiler/tf2xla/dump_graph.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/control_flow.h" diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index bda87c6aed2..e8f10f148d3 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -545,6 +545,7 @@ tf_cuda_library( "framework/device_base.h", "framework/function.h", "framework/graph_def_util.h", + "framework/graph_to_functiondef.h", "framework/kernel_def_builder.h", "framework/log_memory.h", "framework/lookup_interface.h", @@ -999,6 +1000,7 @@ cc_library( "//tensorflow/core/kernels:nn", "//tensorflow/core/kernels:parameterized_truncated_normal_op", "//tensorflow/core/kernels:parsing", + "//tensorflow/core/kernels:partitioned_function_ops", "//tensorflow/core/kernels:random_ops", "//tensorflow/core/kernels:random_poisson_op", "//tensorflow/core/kernels:remote_fused_graph_ops", @@ -3061,6 +3063,7 @@ tf_cc_tests( "framework/common_shape_fns_test.cc", "framework/function_test.cc", "framework/graph_def_util_test.cc", + "framework/graph_to_functiondef_test.cc", "framework/kernel_def_builder_test.cc", "framework/memory_types_test.cc", "framework/node_def_builder_test.cc", @@ -3139,6 +3142,8 @@ tf_cc_tests( ":testlib", "//tensorflow/cc:cc_ops", "//tensorflow/cc:cc_ops_internal", + "//tensorflow/cc:function_ops", + "//tensorflow/cc:ops", "//tensorflow/cc:scope", "//tensorflow/cc:sendrecv_ops", "//tensorflow/cc:while_loop", diff --git a/tensorflow/core/api_def/base_api/api_def_PartitionedCall.pbtxt b/tensorflow/core/api_def/base_api/api_def_PartitionedCall.pbtxt new file mode 100644 index 00000000000..caf8172a529 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_PartitionedCall.pbtxt @@ -0,0 +1,23 @@ +op { + graph_op_name: "PartitionedCall" + in_arg { + name: "args" + description: "A list of input tensors." + } + out_arg { + name: "output" + description: "A list of return values." + } + attr { name: "Tin" description: "A list of input types." } + attr { name: "Tout" description: "A list of output types." } + attr { + name: "f" + description: <